From 6e16dce826b0c81fd7bc7cbfd7e5b1956df8a8d4 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Thu, 2 Jun 2016 00:04:53 -0400 Subject: [PATCH 01/11] OpenZFS 6950 - ARC should cache compressed data Authored by: George Wilson Reviewed by: Prakash Surya Reviewed by: Dan Kimmel Reviewed by: Matt Ahrens Reviewed by: Paul Dagnelie Ported by: David Quigley This review covers the reading and writing of compressed arc headers, sharing data between the arc_hdr_t and the arc_buf_t, and the implementation of a new dbuf cache to keep frequently accessed data uncompressed. I've added a new member to l1 arc hdr called b_pdata. The b_pdata always hangs off the arc_buf_hdr_t (if an L1 hdr is in use) and points to the physical block for that DVA. The physical block may or may not be compressed. If compressed arc is enabled and the block on-disk is compressed, then the b_pdata will match the block on-disk and remain compressed in memory. If the block on disk is not compressed, then neither will the b_pdata. Lastly, if compressed arc is disabled, then b_pdata will always be an uncompressed version of the on-disk block. Typically the arc will cache only the arc_buf_hdr_t and will aggressively evict any arc_buf_t's that are no longer referenced. This means that the arc will primarily have compressed blocks as the arc_buf_t's are considered overhead and are always uncompressed. When a consumer reads a block we first look to see if the arc_buf_hdr_t is cached. If the hdr is cached then we allocate a new arc_buf_t and decompress the b_pdata contents into the arc_buf_t's b_data. If the hdr already has an arc_buf_t, then we will allocate an additional arc_buf_t and bcopy the uncompressed contents from the first arc_buf_t to the new one. Writing to the compressed arc requires that we first discard the b_pdata since the physical block is about to be rewritten. 
The new data contents will be passed in via an arc_buf_t (uncompressed) and during the I/O pipeline stages we will copy the physical block contents to a newly allocated b_pdata. When an l2arc is inuse it will also take advantage of the b_pdata. Now the l2arc will always write the contents of b_pdata to the l2arc. This means that when compressed arc is enabled that the l2arc blocks are identical to those stored in the main data pool. This provides a significant advantage since we can leverage the bp's checksum when reading from the l2arc to determine if the contents are valid. If the compressed arc is disabled, then we must first transform the read block to look like the physical block in the main data pool before comparing the checksum and determining it's valid. OpenZFS Issue: https://www.illumos.org/issues/6950 --- cmd/zdb/zdb.c | 2 +- cmd/ztest/ztest.c | 7 + include/sys/arc.h | 93 +- include/sys/arc_impl.h | 49 +- include/sys/dbuf.h | 13 +- include/sys/refcount.h | 2 + include/sys/spa.h | 8 +- include/sys/trace_arc.h | 30 +- include/sys/trace_dbuf.h | 14 + include/sys/zio.h | 4 + include/sys/zio_checksum.h | 4 + man/man5/zfs-module-parameters.5 | 14 - module/zfs/arc.c | 3466 ++++++++++++++++-------------- module/zfs/dbuf.c | 658 ++++-- module/zfs/dbuf_stats.c | 2 +- module/zfs/dmu.c | 7 +- module/zfs/dmu_diff.c | 4 +- module/zfs/dmu_objset.c | 15 +- module/zfs/dmu_send.c | 8 +- module/zfs/dmu_traverse.c | 4 +- module/zfs/dnode.c | 2 +- module/zfs/dnode_sync.c | 4 +- module/zfs/dsl_scan.c | 6 +- module/zfs/refcount.c | 24 + module/zfs/zil.c | 6 +- module/zfs/zio.c | 6 +- module/zfs/zio_checksum.c | 57 +- 27 files changed, 2491 insertions(+), 2018 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 6a076e65da5a..611e9229461c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1297,7 +1297,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, 
&buf); } return (err); diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 852bf00a616b..1b77b6ceed97 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -192,6 +192,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5880,6 +5881,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } thread_exit(); diff --git a/include/sys/arc.h b/include/sys/arc.h index d8a85e830e15..13788a9b671c 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -44,12 +44,24 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); typedef void arc_prune_func_t(int64_t bytes, void *private); -typedef int arc_evict_func_t(void *private); /* Shared module parameters */ extern int zfs_arc_average_blocksize; @@ -58,6 +70,8 @@ extern int zfs_arc_average_blocksize; arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + /* 
generic arc_prune_func_t wrapper for callbacks */ struct arc_prune { arc_prune_func_t *p_pfunc; @@ -77,37 +91,54 @@ typedef enum arc_flags /* * Public flags that can be passed into the ARC by external consumers. */ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. 
*/ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 } arc_flags_t; @@ -116,11 +147,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -154,7 +184,7 @@ typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; uint32_t abi_flags; - uint32_t abi_datacnt; + uint32_t abi_bufcnt; uint64_t abi_size; uint64_t abi_spa; uint64_t abi_access; @@ -171,13 +201,12 @@ typedef struct arc_buf_info { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, uint64_t size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); @@ -185,7 +214,6 @@ int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -194,8 +222,7 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 
arc_done_func_t *done, void *private, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, @@ -205,13 +232,11 @@ arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -boolean_t arc_clear_callback(arc_buf_t *buf); - void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 5c57c3157ae8..c23187d6a62e 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,7 +74,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
@@ -140,11 +140,13 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; arc_buf_t *b_buf; - uint32_t b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ @@ -163,8 +165,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev { @@ -185,10 +186,7 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ uint32_t b_hits; - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -202,20 +200,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. - */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. 
+ * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 07935891152d..bf546db6f985 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. 
*/ @@ -303,8 +309,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/include/sys/refcount.h b/include/sys/refcount.h index d3f90fdc6277..ac82a4106dd1 100644 --- a/include/sys/refcount.h +++ b/include/sys/refcount.h @@ -70,6 +70,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); void refcount_init(void); void refcount_fini(void); @@ -97,6 +98,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) #define refcount_init() #define refcount_fini() diff --git a/include/sys/spa.h b/include/sys/spa.h index 51d4619f47ff..738b4db30f63 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -135,6 +135,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). 
* The members of the dva_t should be considered opaque outside the SPA. @@ -363,8 +365,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 0fca639e1493..0384a44ab36d 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -50,9 +50,10 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_datacnt) + __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) - __field(uint64_t, hdr_size) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) @@ -68,8 +69,9 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; - __entry->hdr_size = ab->b_size; + __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; + __entry->hdr_psize = ab->b_psize; + __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; __entry->hdr_access = ab->b_l1hdr.b_arc_access; @@ -81,13 +83,13 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x datacnt %u type %u size %llu spa %llu " + "flags 0x%x bufcnt %u type %u 
psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, - __entry->hdr_spa, __entry->hdr_state_type, + __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, @@ -185,9 +187,10 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_datacnt) + __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) - __field(uint64_t, hdr_size) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) @@ -215,8 +218,9 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; - __entry->hdr_size = hdr->b_size; + __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; + __entry->hdr_psize = hdr->b_psize; + __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; __entry->hdr_access = hdr->b_l1hdr.b_arc_access; @@ -246,7 +250,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x datacnt %u size %llu spa %llu state_type %u " + "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 
0x%llx:0x%llx dva2 " @@ -255,7 +259,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_size, + __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/include/sys/trace_dbuf.h b/include/sys/trace_dbuf.h index aca6d6531f5c..76274d152575 100644 --- a/include/sys/trace_dbuf.h +++ b/include/sys/trace_dbuf.h @@ -92,6 +92,20 @@ DEFINE_EVENT(zfs_dbuf_class, name, \ TP_ARGS(db, zio)) DEFINE_DBUF_EVENT(zfs_blocked__read); +DECLARE_EVENT_CLASS(zfs_dbuf_evict_one_class, + TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), + TP_ARGS(db, mls), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk(DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS) +); + +#define DEFINE_DBUF_EVICT_ONE_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_evict_one_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), \ + TP_ARGS(db, mls)) +DEFINE_DBUF_EVICT_ONE_EVENT(zfs_dbuf__evict__one); + #endif /* _TRACE_DBUF_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/sys/zio.h b/include/sys/zio.h index d5a1494df2a5..53729f2840ed 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -527,6 +527,10 @@ extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void *zio_buf_alloc_flags(size_t size, int flags); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9fcfd521f4ad..04573ba5456f 
100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -64,8 +64,12 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; */ extern zio_checksum_func_t zio_checksum_SHA256; +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index b4ad3700f3cf..3ccaf92a1113 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -96,20 +96,6 @@ successfully compressed before writing. A value of 100 disables this feature. Default value: \fB200\fR. .RE -.sp -.ne 2 -.na -\fBl2arc_max_block_size\fR (ulong) -.ad -.RS 12n -The maximum block size which may be written to an L2ARC device, after -compression and other factors. This setting is used to prevent a small -number of large blocks from pushing a larger number of small blocks out -of the cache. -.sp -Default value: \fB16,777,216\fR. -.RE - .sp .ne 2 .na diff --git a/module/zfs/arc.c b/module/zfs/arc.c index cc26b2e48bb0..bf73d51415ce 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ /* @@ -128,9 +128,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. 
+ * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. 
Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. 
However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + #include #include +#include #include +#include #include #include #include @@ -162,10 +287,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower @@ -224,6 +345,11 @@ static int arc_dead; */ static boolean_t arc_warm; +/* + * log2 fraction of the zio arena to keep free. + */ +int arc_zio_arena_free_shift = 2; + /* * These tunables are for performance analysis. */ @@ -236,9 +362,10 @@ unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +int zfs_compressed_arc_enabled = B_TRUE; + /* * These tunables are Linux specific */ @@ -307,6 +434,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. 
This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually @@ -452,25 +599,17 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_writes_skip_toobig; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; @@ -521,6 +660,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -552,25 +694,17 @@ static arc_stats_t arc_stats = { { "l2_writes_done", 
KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_writes_skip_toobig", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, @@ -647,7 +781,7 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_no_grow ARCSTAT(arcstat_no_grow) +#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ @@ -661,14 +795,16 @@ static arc_state_t *arc_l2c_only; #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define 
arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static taskq_t *arc_prune_taskq; -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -678,25 +814,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ 
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -742,7 +888,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ -#define L2ARC_MAX_BLOCK_SIZE (16 * 1024 * 1024) /* max compress size */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -752,17 +897,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ - -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). 
- */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -771,11 +905,9 @@ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -unsigned long l2arc_max_block_size = L2ARC_MAX_BLOCK_SIZE; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_nocompress = B_FALSE; /* don't compress bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ @@ -792,19 +924,17 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -812,7 +942,10 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t 
*hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); @@ -821,14 +954,12 @@ static void arc_prune_async(int64_t); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -846,14 +977,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -875,7 +1006,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -913,13 +1044,13 @@ 
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -947,12 +1078,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1049,7 +1180,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1063,7 +1194,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { ASSERTV(arc_buf_hdr_t *hdr = vbuf); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1145,159 +1276,140 @@ buf_init(void) } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. 
- */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - dev = hdr->b_l2hdr.b_dev; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); +#define ARC_MINTIME (hz>>4) /* 62 ms */ - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); +} - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. 
- */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - return (equal); + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. 
When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + uint64_t lsize; + uint64_t csize; + void *cbuf; + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + + cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + lsize = HDR_GET_LSIZE(hdr); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. 
The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. + */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -1315,7 +1427,7 @@ arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, + ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), PROT_READ | PROT_WRITE)); } #endif @@ -1327,18 +1439,22 @@ arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) - ASSERT0(mprotect(buf->b_data, 
buf->b_hdr->b_size, PROT_READ)); + ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), + PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1360,50 +1476,245 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } - - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); - + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. 
When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * thread-safe manner. + */ +static void +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. + * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. 
+ */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. + */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. 
+ */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + arc_buf_t *buf; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. 
+ */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + arc_buf_t *buf; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } state = hdr->b_l1hdr.b_state; @@ -1411,27 +1722,20 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_remove(list, hdr); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { @@ -1448,15 +1752,9 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_insert(list, hdr); - - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } @@ -1491,7 +1789,7 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -1502,14 +1800,12 @@ 
arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; - abi->abi_l2arc_asize = l2hdr->b_asize; - abi->abi_l2arc_compress = l2hdr->b_compress; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_size = hdr->b_size; + abi->abi_size = arc_hdr_size(hdr); } /* @@ -1522,8 +1818,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, { arc_state_t *old_state; int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; + uint32_t bufcnt; + boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* @@ -1536,20 +1832,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1557,26 +1853,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. 
- */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; - /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or @@ -1586,39 +1873,39 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
*/ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1627,35 +1914,54 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. 
*/ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1664,9 +1970,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -1760,18 +2086,23 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr, subsequent buffers will + * use arc_buf_clone(). 
+ */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - VERIFY3U(size, <=, spa_maxblocksize(spa)); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; @@ -1781,23 +2112,64 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC write) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing it's + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. 
+ */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} + +/* + * Used when allocating additional buffers. + */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -1814,7 +2186,7 @@ arc_loan_buf(spa_t *spa, uint64_t size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -1828,12 +2200,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - 
atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -1842,244 +2214,394 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - atomic_add_64(&arc_loaned_bytes, hdr->b_size); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) +static void +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_type = type; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); +} - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); - /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. 
- */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); } -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) +/* + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. + */ +static void +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. 
*/ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); - return; - } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); + refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. + */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - l2arc_data_free_t *df; + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); + /* + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. 
+ */ + refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; + + /* + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. + */ + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { + arc_buf_t **bufp; arc_buf_hdr_t *hdr = buf->b_hdr; - - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); - } -} - -static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + arc_buf_t *lastbuf = NULL; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. */ - if (!HDR_HAS_L1HDR(hdr)) + if (buf->b_data != NULL) { + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + arc_cksum_verify(buf); + arc_buf_unwatch(buf); + + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); + } + buf->b_data = NULL; + + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; + } + + /* only remove the buf if requested */ + if (!remove) return; + /* remove the buf from the hdr list */ + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); } + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + /* - * The header does not have compression enabled. This can be due - * to the buffer not being compressible, or because we're - * freeing the buffer before the second phase of - * l2arc_write_buffer() has started (which does the compression - * step). In either case, b_tmp_cdata does not point to a - * separately compressed buffer, so there's nothing to free (it - * points to the same buffer as the arc_buf_t's b_data field). + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. 
*/ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. - */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); - ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size, zio_data_buf_free); + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. 
+ */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - hdr->b_l1hdr.b_tmp_cdata = NULL; + return (hdr); } /* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. */ -static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { - arc_buf_t **bufp; + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - /* free up data associated with the buf */ - if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); - arc_cksum_verify(buf); - arc_buf_unwatch(buf); + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); - } + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); + if (new == 
hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - (void) refcount_remove_many(&state->arcs_size, size, buf); - buf->b_data = NULL; + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. 
+ */ + (void) buf_hash_insert(nhdr, NULL); - /* only remove the buf if requested */ - if (!remove) - return; + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; - buf->b_next = NULL; + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); - ASSERT(buf->b_efunc == NULL); + mutex_exit(&dev->l2ad_mtx); - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. + */ + + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); + + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. 
+ */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); } static void @@ -2087,50 +2609,20 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); - /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() - */ - arc_buf_l2_cdata_free(hdr); - - /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. - * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). - */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); - - /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. Thus, we must be careful - * not to decrement them for this header either. 
- */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2138,13 +2630,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2168,33 +2663,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - 
cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); } } @@ -2209,133 +2685,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. 
- */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } uint64_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. 
Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. - */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2362,11 +2740,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_tmp_cdata field) during its write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. @@ -2377,11 +2755,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. 
@@ -2394,6 +2774,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2413,7 +2794,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2421,37 +2801,39 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. 
+ * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). + */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -2707,12 +3089,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. 
if the ARC isn't shutting down), this function might @@ -2724,7 +3106,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -2795,8 +3177,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -2823,8 +3205,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, static uint64_t arc_adjust_meta_balanced(void) { - int64_t adjustmnt, delta, prune = 0; - uint64_t total_evicted = 0; + int64_t delta, prune = 0; + uint64_t adjustmnt, total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); @@ -2839,8 +3221,9 @@ arc_adjust_meta_balanced(void) */ adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { - delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); + if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&arc_mru->arcs_esize[type]), + adjustmnt); total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } @@ -2855,23 +3238,26 @@ arc_adjust_meta_balanced(void) * simply decrement the amount of data evicted from the MRU. 
*/ - if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { - delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); + if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]), + adjustmnt); total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); } adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + if (adjustmnt > 0 && + refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[type]); + refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { + if (adjustmnt > 0 && + refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[type]); + refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); } @@ -3167,36 +3553,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. 
*/ @@ -3216,9 +3579,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3361,15 +3721,15 @@ arc_available_memory(void) /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains - * above about 1/16th free. + * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. * - * Note: The 1/16th arena free requirement was put in place - * to aggressively evict memory from the arc in order to avoid - * memory fragmentation issues. + * Note that reducing the arc_zio_arena_free_shift keeps more virtual + * memory (in the zio_arena) free, which can avoid memory + * fragmentation issues. */ if (zio_arena != NULL) { - n = vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> 4); + n = vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, + VMEM_ALLOC) >> arc_zio_arena_free_shift); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; @@ -3389,7 +3749,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t @@ -3478,6 +3838,21 @@ arc_reclaim_thread(void) arc_tuning_update(); + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. 
The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ +#ifndef __linux__ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); +#endif mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3547,60 +3922,13 @@ arc_reclaim_thread(void) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ spl_fstrans_unmark(cookie); thread_exit(); } -static void -arc_user_evicts_thread(void) -{ - fstrans_cookie_t cookie = spl_fstrans_mark(); - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). 
- */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig(&arc_user_evicts_cv, - &arc_user_evicts_lock, ddi_get_lbolt() + hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - spl_fstrans_unmark(cookie); - thread_exit(); -} - #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the @@ -3650,15 +3978,15 @@ arc_user_evicts_thread(void) static uint64_t arc_evictable_memory(void) { uint64_t arc_clean = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); uint64_t ghost_clean = - arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] + - arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]; + refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]) + + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); if (arc_dirty >= arc_c_min) @@ -3834,18 +4162,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. 
If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. */ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -3885,12 +4212,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -3898,11 +4226,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
*/ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -3915,9 +4241,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -3927,6 +4254,37 @@ arc_get_data_buf(arc_buf_t *buf) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } + return (datap); +} + +/* + * Free the arc data buffer. + */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -3970,7 +4328,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } @@ -4007,7 +4365,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4080,8 +4438,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -4090,7 +4448,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -4098,18 +4456,30 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } } +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -4128,34 +4498,34 @@ 
arc_read_done(zio_t *zio) ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); - found = buf_hash_find(hdr->b_spa, zio->io_bp, - &hash_lock); + found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - if (BP_GET_LEVEL(zio->io_bp) > 0) - byteswap_uint64_array(buf->b_data, hdr->b_size); - else - dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); - arc_buf_watch(buf); + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -4169,31 +4539,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. 
acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. + */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -4263,7 +4652,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4281,8 +4669,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && 
HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4314,7 +4702,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4335,10 +4724,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; } @@ -4361,34 +4749,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4398,20 +4788,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; /* * Gracefully handle a damaged logical block size as a * checksum error. 
*/ - if (size > spa_maxblocksize(spa)) { - ASSERT3P(buf, ==, NULL); + if (lsize > spa_maxblocksize(spa)) { rc = SET_ERROR(ECKSUM); goto out; } @@ -4420,8 +4809,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4431,26 +4821,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4462,53 +4835,60 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr_full_cache); } + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. 
+	 * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). */ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + + /* + * If compression is enabled on the hdr, then we will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; } + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -4517,6 +4897,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4524,19 +4909,15 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. 
*/ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4558,15 +4939,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + lsize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* @@ -4575,26 +4954,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
*/ - if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, buf->b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + size, hdr->b_l1hdr.b_pdata, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4624,8 +4997,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); @@ -4678,20 +5051,6 @@ arc_remove_prune_callback(arc_prune_t *p) kmem_free(p, sizeof (*p)); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. 
*/ @@ -4707,85 +5066,38 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); - } else { + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); mutex_exit(hash_lock); - } - -} - -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. 
- */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); + mutex_exit(hash_lock); } - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); } /* @@ -4821,16 +5133,19 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4871,79 +5186,116 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? 
*/ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + arc_buf_t *lastbuf = NULL; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. 
+ */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - uint64_t *size; - ASSERT3P(state, !=, arc_l2c_only); - size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); - } - hdr->b_l1hdr.b_datacnt -= 1; + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_bufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; nhdr->b_l1hdr.b_mfu_ghost_hits = 0; nhdr->b_l1hdr.b_l2_hits = 0; - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - - nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_datacnt = 1; - nhdr->b_l1hdr.b_state = arc_anon; - nhdr->b_l1hdr.b_arc_access = 0; - nhdr->b_l1hdr.b_tmp_cdata = NULL; - nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); - (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -4962,8 +5314,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; } int @@ -4997,28 +5347,83 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + enum zio_compress compress; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. 
- * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback as invoked. */ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); + arc_buf_unwatch(buf); + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -5049,9 +5454,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -5059,7 +5466,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -5068,7 +5475,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). 
*/ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -5102,19 +5509,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5125,7 +5532,7 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, @@ -5135,16 +5542,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5153,7 +5558,30 @@ arc_write(zio_t *pio, spa_t *spa, 
uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -5180,7 +5608,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) last_txg = txg; page_load = 0; } - /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to @@ -5259,12 +5686,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return 
(SET_ERROR(ERESTART)); } @@ -5277,8 +5706,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -5331,7 +5762,7 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given @@ -5426,6 +5857,124 @@ arc_tuning_update(void) } +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + 
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + 
refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); + + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return 
(arc_c_max); +} + void arc_init(void) { @@ -5442,9 +5991,6 @@ arc_init(void) cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -5488,6 +6034,7 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; @@ -5510,82 +6057,18 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, 
arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - - arc_anon->arcs_state = ARC_STATE_ANON; - arc_mru->arcs_state = ARC_STATE_MRU; - arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; - arc_mfu->arcs_state = ARC_STATE_MFU; - arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; - arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; - - refcount_create(&arc_anon->arcs_size); - refcount_create(&arc_mru->arcs_size); - refcount_create(&arc_mru_ghost->arcs_size); - refcount_create(&arc_mfu->arcs_size); - refcount_create(&arc_mfu_ghost->arcs_size); - refcount_create(&arc_l2c_only->arcs_size); - + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); - arc_eviction_list = NULL; mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_reclaim_thread_exit = B_FALSE; + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), 
KSTAT_FLAG_VIRTUAL); @@ -5598,10 +6081,7 @@ arc_init(void) (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, defclsyspri); - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, defclsyspri); - - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5634,10 +6114,10 @@ arc_fini(void) #endif /* _KERNEL */ mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5645,22 +6125,10 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); - - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5685,27 +6153,7 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -5835,7 +6283,6 @@ arc_fini(void) * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers - * l2arc_nocompress skip compressing buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this @@ -5995,9 +6442,13 @@ 
l2arc_do_free_on_write(void) for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6020,13 +6471,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -6084,45 +6535,30 @@ l2arc_write_done(zio_t *zio) */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) { - list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; - hdr->b_flags &= ~ARC_FLAG_L2CACHE; - - ARCSTAT_BUMP(arcstat_l2_writes_skip_toobig); - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - } else if (zio->io_error != 0) { + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. 
*/ list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -6149,43 +6585,36 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -6198,7 +6627,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -6211,9 +6640,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -6363,12 +6793,11 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -6389,37 +6818,23 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * the delta by which the device hand has changed due to alignment). 
*/ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, buf_compress_minsz, - stats_size; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; - const boolean_t do_headroom_boost = *headroom_boost; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. */ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; - - /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. @@ -6440,13 +6855,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; + uint64_t asize, size; + void *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -6461,7 +6876,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. 
@@ -6475,15 +6890,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -6507,187 +6914,98 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ZIO_FLAG_CANFAIL); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing) - */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; hdr->b_l2hdr.b_hits = 0; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; - /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). - * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. 
We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. - */ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); - - mutex_exit(hash_lock); - - write_sz += buf_sz; - write_asize += buf_a_sz; - } - - multilist_sublist_unlock(mls); - - if (full == B_TRUE) - break; - } - - /* No buffers selected for writing? */ - if (pio == NULL) { - ASSERT0(write_sz); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - return (0); - } - - mutex_enter(&dev->l2ad_mtx); - - /* - * Note that elsewhere in this file arcstat_l2_asize - * and the used space on l2ad_vdev are updated using b_asize, - * which is not necessarily rounded up to the device block size. - * Too keep accounting consistent we do the same here as well: - * stats_size accumulates the sum of b_asize of the written buffers, - * while write_asize accumulates the sum of b_asize rounded up - * to the device block size. - * The latter sum is used only to validate the corectness of the code. - */ - stats_size = 0; - write_asize = 0; - - /* - * Now start writing the buffers. We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. 
- */ - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(hdr)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. - */ - *headroom_boost = B_TRUE; - } - } - - /* - * Pick up the buffer data we had previously stashed away - * (and now potentially also compressed). - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - buf_sz = hdr->b_l2hdr.b_asize; + ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + size = arc_hdr_size(hdr); - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - uint64_t buf_a_sz; + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* - * Buffers which are larger than l2arc_max_block_size - * after compression are skipped and removed from L2 - * eligibility. 
+ * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. */ - if (buf_sz > l2arc_max_block_size) { - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - continue; - } + if (!HDR_SHARED_DATA(hdr)) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(size); + } + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + l2arc_free_data_on_write(to_write, size, type); + } wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + hdr->b_l2hdr.b_daddr, size, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + write_sz += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); - (void) zio_nowait(wzio); - - stats_size += buf_sz; + write_asize += size; /* * Keep the clock hand suitably device-aligned. */ - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_asize += buf_a_sz; - dev->l2ad_hand += buf_a_sz; + asize = vdev_psize_to_asize(dev->l2ad_vdev, size); + write_psize += asize; + dev->l2ad_hand += asize; + + mutex_exit(hash_lock); + + (void) zio_nowait(wzio); } + + multilist_sublist_unlock(mls); + + if (full == B_TRUE) + break; } - mutex_exit(&dev->l2ad_mtx); + /* No buffers selected for writing? 
*/ + if (pio == NULL) { + ASSERT0(write_sz); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + return (0); + } ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, stats_size); - vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. @@ -6705,186 +7023,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (write_asize); } -/* - * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The original l2hdr contents were left - * untouched and are ready for writing to an L2 device. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). 
- */ -static boolean_t -l2arc_compress_buf(arc_buf_hdr_t *hdr) -{ - void *cdata; - size_t csize, len, rounded; - l2arc_buf_hdr_t *l2hdr; - - ASSERT(HDR_HAS_L2HDR(hdr)); - - l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - ASSERT3P(cdata, !=, NULL); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, - cdata, l2hdr->b_asize); - - rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } else if (csize > 0 && csize < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression failed, release the compressed buffer. - * l2hdr will be left unmodified. - */ - zio_data_buf_free(cdata, len); - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. 
- * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - uint64_t csize; - void *cdata; - - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. - */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. */ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 
- * This buffer serves as a temporary holder of compressed data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. - */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - enum zio_compress comp; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(HDR_HAS_L2HDR(hdr)); - comp = hdr->b_l2hdr.b_compress; - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (comp == ZIO_COMPRESS_OFF) { - /* - * In this case, b_tmp_cdata points to the same buffer - * as the arc_buf_t's b_data field. We don't want to - * free it, since the arc_buf_t will handle that. - */ - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else if (comp == ZIO_COMPRESS_EMPTY) { - /* - * In this case, b_tmp_cdata was compressed to an empty - * buffer, thus there's nothing to free and b_tmp_cdata - * should have been set to NULL in l2arc_write_buffers(). - */ - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - /* - * If the data was compressed, then we've allocated a - * temporary buffer for it, so now we need to release it. - */ - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - -} - /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -6897,7 +7035,6 @@ l2arc_feed_thread(void) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -6937,7 +7074,7 @@ l2arc_feed_thread(void) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -6970,7 +7107,7 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. 
@@ -7065,7 +7202,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list @@ -7154,7 +7291,6 @@ l2arc_stop(void) EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); -EXPORT_SYMBOL(arc_buf_remove_ref); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); @@ -7197,12 +7333,12 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); module_param(zfs_arc_p_min_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); -module_param(zfs_disable_dup_eviction, int, 0644); -MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction"); - module_param(zfs_arc_average_blocksize, int, 0444); MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size"); +module_param(zfs_compressed_arc_enabled, int, 0644); +MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers"); + module_param(zfs_arc_min_prefetch_lifespan, int, 0644); MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); @@ -7222,9 +7358,6 @@ MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); module_param(l2arc_headroom_boost, ulong, 0644); MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier"); -module_param(l2arc_max_block_size, ulong, 0644); -MODULE_PARM_DESC(l2arc_max_block_size, "Skip L2ARC buffers larger than N"); - module_param(l2arc_feed_secs, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); @@ -7234,9 +7367,6 @@ MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); module_param(l2arc_noprefetch, int, 0644); MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); -module_param(l2arc_nocompress, int, 0644); -MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers"); - module_param(l2arc_feed_again, int,
0644); MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d297506e97ff..938db4270796 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -45,6 +45,7 @@ #include #include #include +#include struct dbuf_hold_impl_data { /* Function arguments */ @@ -71,13 +72,13 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. A nonzero value indicates a potential performance problem. */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -89,9 +90,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs are added to the dbuf cache once the last hold is released. If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. 
+ */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +unsigned long dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. */ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. 
If the cache size continues to grow and hits the high + * water mark, then callers adding elements to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. + */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) @@ -101,7 +169,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); + multilist_link_init(&db->db_cache_link); return (0); } @@ -113,6 +183,7 @@ dbuf_dest(void *vdb, void *unused) dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -142,8 +213,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -158,7 +227,7 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) uint64_t idx; dmu_buf_impl_t *db; - hv = DBUF_HASH(os, obj, level, blkid); + hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); @@ -211,7 +280,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) dmu_buf_impl_t *dbf; blkid = db->db_blkid; - hv = DBUF_HASH(os, obj, level, blkid); + hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); @@ -245,7 +314,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; - hv = dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; @@ -269,8 +338,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db) atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t dbuf_do_evict; - typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING @@ -358,17 +425,186 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } -void -dbuf_evict(dmu_buf_impl_t *db) + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); + dmu_buf_impl_t *db = obj; + + /* + * The assumption here, is the hash value for a given + * dmu_buf_impl_t will remain constant throughout its lifetime + * (i.e. its objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublist's usage + * would not be evenly distributed.
+ */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + dmu_buf_impl_t *db; + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached it's maximum size, dbufs are removed + * and destroyed. 
The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. + */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the callers context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. 
To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; - dbuf_clear(db); - dbuf_destroy(db); + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } + + void dbuf_init(void) { @@ -403,7 +639,7 @@ dbuf_init(void) goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); @@ -412,11 +648,31 @@ dbuf_init(void) dbuf_stats_init(h); + /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -438,8 +694,23 @@ dbuf_fini(void) #else kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -598,7 +869,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -613,8 +884,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -625,6 +894,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -636,6 +906,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = 
db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -704,7 +975,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -759,7 +1030,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -797,8 +1068,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -918,7 +1187,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -976,9 +1245,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1102,7 +1372,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is 
referenced */ @@ -1210,7 +1480,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1221,7 +1491,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1620,7 +1890,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1629,12 +1899,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); return (B_TRUE); } @@ -1799,7 +2065,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1817,10 +2083,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - 
VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1832,61 +2098,64 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. 
- */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1897,15 +2166,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); /* * If this dbuf is referenced from an indirect dbuf, @@ -2027,7 +2306,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -2076,7 +2355,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -2101,76 +2380,12 @@ dbuf_create(dnode_t *dn, uint8_t 
level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. - */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. 
*/ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2210,10 +2425,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2242,7 +2484,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2340,6 +2583,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + 
dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2389,7 +2633,7 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); *(dh->dh_dbp) = NULL; -top: + /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); @@ -2425,18 +2669,8 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) return (SET_ERROR(ENOENT)); } - if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { - arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); - if (dh->dh_db->db_buf->b_data == NULL) { - dbuf_clear(dh->dh_db); - if (dh->dh_parent) { - dbuf_rele(dh->dh_parent, NULL); - dh->dh_parent = NULL; - } - goto top; - } + if (dh->dh_db->db_buf != NULL) ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); @@ -2455,13 +2689,19 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db); dbuf_set_data(dh->dh_db, - arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, + arc_alloc_buf(dh->dh_dn->dn_objset->os_spa, dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); bcopy(dh->dh_dr->dt.dl.dr_data->b_data, dh->dh_db->db.db_data, dh->dh_db->db.db_size); } } + if (multilist_link_active(&dh->dh_db->db_cache_link)) { + ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); + multilist_remove(&dbuf_cache, dh->dh_db); + (void) refcount_remove_many(&dbuf_cache_size, + dh->dh_db->db.db_size, dh->dh_db); + } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); DBUF_VERIFY(dh->dh_db); mutex_exit(&dh->dh_db->db_mtx); @@ -2587,7 +2827,8 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { - VERIFY(refcount_add(&db->db_holds, tag) > 1); + int64_t holds = refcount_add(&db->db_holds, tag); + VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2658,8 +2899,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, 
void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2704,55 +2947,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -3091,7 +3323,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3362,10 +3594,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3381,8 +3610,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3558,17 +3785,17 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, 
dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, - dbuf_write_physdone, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + &zp, dbuf_write_ready, + children_ready_cb, dbuf_write_physdone, + dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); } } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); -EXPORT_SYMBOL(dbuf_evict); +EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); @@ -3583,7 +3810,6 @@ EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); -EXPORT_SYMBOL(dbuf_clear); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); @@ -3601,4 +3827,24 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); + + +module_param(dbuf_cache_max_bytes, ulong, 0644); +MODULE_PARM_DESC(dbuf_cache_max_bytes, + "Maximum size in bytes of the dbuf cache."); + +module_param(dbuf_cache_hiwater_pct, uint, 0644); +MODULE_PARM_DESC(dbuf_cache_hiwater_pct, + "Percentage over dbuf_cache_max_bytes when dbufs \ + much be evicted directly."); + +module_param(dbuf_cache_lowater_pct, uint, 0644); +MODULE_PARM_DESC(dbuf_cache_lowater_pct, + "Percentage below dbuf_cache_max_bytes \ + when the evict thread stop evicting dbufs."); + +module_param(dbuf_cache_max_shift, int, 0644); +MODULE_PARM_DESC(dbuf_cache_max_shift, + "Cap the size of the dbuf cache to log2 fraction of arc size."); + #endif diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 6f39f80e563a..ae8ba8682599 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -95,7 +95,7 @@ __dbuf_stats_hash_table_data(char 
*buf, size_t size, dmu_buf_impl_t *db) abi.abi_state_type, abi.abi_state_contents, abi.abi_flags, - (ulong_t)abi.abi_datacnt, + (ulong_t)abi.abi_bufcnt, (u_longlong_t)abi.abi_size, (u_longlong_t)abi.abi_access, (ulong_t)abi.abi_mru_hits, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 6ed4743ee01c..7992e3049760 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1336,7 +1336,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1680,8 +1680,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2039,11 +2038,11 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 7665d1ca591d..982b96132cc8 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #include @@ -146,7 +146,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 22ca84d96a27..ac98ab6f2165 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -358,8 +358,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -376,14 +374,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -392,7 +389,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -475,8 +472,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -787,7 +783,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -1183,7 +1179,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 56936cf235cb..07067ff1e44d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -647,7 +647,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -659,7 +659,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -684,7 +684,7 @@ 
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (zfs_send_corrupt_data) { uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; @@ -713,7 +713,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 4c9459412ff3..65f82cbcd193 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -386,7 +386,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -610,7 +610,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 8015f54edc56..3a918c3be33e 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -514,7 +514,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 54066e2e3ffa..b19f50af9c72 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -421,7 +421,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); - dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); avl_remove(&dn->dn_dbufs, 
db_marker); @@ -445,7 +445,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 7389b4b1d30d..41b3ce79b04d 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -692,7 +692,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -722,7 +722,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -754,7 +754,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (0); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 4c460a200967..1903c59540d3 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -227,4 +227,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = 
B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} #endif /* ZFS_DEBUG */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 863ccb930c37..c18807be0ae6 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -266,7 +266,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -303,7 +303,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3dd8cffe9ca2..545a43d81424 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -301,7 +301,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -319,7 +319,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -2390,7 +2390,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3a5c73a6a1e9..b05e787dcaac 100644 --- 
a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -187,25 +187,19 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); if (ci->ci_eck) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -244,32 +238,51 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) ci->ci_func[byteswap](data, size, &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, &actual_cksum); } - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, 
expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); } From f7e79993ada498efe859bc4eb07d0a9b331e91de Mon Sep 17 00:00:00 2001 From: David Quigley Date: Mon, 11 Jul 2016 13:45:52 -0400 Subject: [PATCH 02/11] DLPX-40252 integrate EP-476 compressed zfs send/receive --- cmd/zfs/zfs_main.c | 9 +- cmd/zstreamdump/zstreamdump.c | 30 +- include/libzfs.h | 3 + include/libzfs_core.h | 5 +- include/sys/arc.h | 20 +- include/sys/arc_impl.h | 1 + include/sys/dmu.h | 5 +- include/sys/dmu_send.h | 10 +- include/sys/dsl_dataset.h | 2 + include/sys/refcount.h | 2 +- include/sys/zfs_ioctl.h | 23 +- include/sys/zio.h | 26 +- include/sys/zio_compress.h | 23 +- lib/libzfs/libzfs_sendrecv.c | 51 ++- lib/libzfs_core/libzfs_core.c | 11 +- man/man8/zfs.8 | 38 +- module/zfs/arc.c | 762 ++++++++++++++++++++++------------ module/zfs/dbuf.c | 173 ++++---- module/zfs/dmu.c | 37 +- module/zfs/dmu_objset.c | 11 +- module/zfs/dmu_send.c | 224 +++++++--- module/zfs/dsl_dataset.c | 8 + module/zfs/zfs_ioctl.c | 35 +- module/zfs/zio.c | 84 ++-- 24 files changed, 1052 insertions(+), 541 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 8a8f56af9928..6b9e650a29b4 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c 
@@ -261,7 +261,7 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " + return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] " "\n" "\tsend [-Le] [-i snapshot|bookmark] " "\n" @@ -3718,7 +3718,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:c")) != -1) { switch (c) { case 'i': if (fromname) @@ -3762,6 +3762,9 @@ zfs_do_send(int argc, char **argv) case 't': resume_token = optarg; break; + case 'c': + flags.compress = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3838,6 +3841,8 @@ zfs_do_send(int argc, char **argv) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags.compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index 08d52bb37a83..e0bc34542b77 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -27,7 +27,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ #include @@ -40,6 +40,7 @@ #include #include +#include #include /* @@ -252,6 +253,7 @@ main(int argc, char *argv[]) (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); + break; } } @@ -457,38 +459,50 @@ main(int argc, char *argv[]) drrw->drr_object = BSWAP_64(drrw->drr_object); drrw->drr_type = BSWAP_32(drrw->drr_type); drrw->drr_offset = BSWAP_64(drrw->drr_offset); - drrw->drr_length = BSWAP_64(drrw->drr_length); + drrw->drr_logical_size = + BSWAP_64(drrw->drr_logical_size); drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); drrw->drr_key.ddk_prop = BSWAP_64(drrw->drr_key.ddk_prop); + drrw->drr_compressed_size = + BSWAP_64(drrw->drr_compressed_size); } + + uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + /* * If this is verbose and/or dump output, * print info on the modified block */ if (verbose) { (void) printf("WRITE object = %llu type = %u " - "checksum type = %u\n" - " offset = %llu length = %llu " + "checksum type = %u compression type = %u\n" + " offset = %llu logical_size = %llu " + "compressed_size = %llu " + "payload_size = %llu " "props = %llx\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, drrw->drr_checksumtype, + drrw->drr_compressiontype, (u_longlong_t)drrw->drr_offset, - (u_longlong_t)drrw->drr_length, + (u_longlong_t)drrw->drr_logical_size, + (u_longlong_t)drrw->drr_compressed_size, + (u_longlong_t)payload_size, (u_longlong_t)drrw->drr_key.ddk_prop); } + /* * Read the contents of the block in from STDIN to buf */ - (void) ssread(buf, drrw->drr_length, &zc); + (void) ssread(buf, payload_size, &zc); /* * If in dump mode */ if (dump) { - print_block(buf, drrw->drr_length); + print_block(buf, payload_size); } - total_write_size += drrw->drr_length; + total_write_size += payload_size; break; case DRR_WRITE_BYREF: diff --git a/include/libzfs.h b/include/libzfs.h index f83e21423aa0..7acd93bb9927 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -631,6 +631,9 @@ typedef struct sendflags { /* WRITE_EMBEDDED records of type 
DATA are permitted */ boolean_t embed_data; + + /* compressed WRITE records are permitted */ + boolean_t compress; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 9e761004aa36..bc0f115bbfdf 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -54,13 +54,14 @@ int lzc_get_holds(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, - LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1 + LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, + LZC_SEND_FLAG_COMPRESS = 1 << 2 }; int lzc_send(const char *, const char *, int, enum lzc_send_flags); int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t); -int lzc_send_space(const char *, const char *, uint64_t *); +int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); struct dmu_replay_record; diff --git a/include/sys/arc.h b/include/sys/arc.h index 13788a9b671c..97529c3fc3ae 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -142,11 +142,17 @@ typedef enum arc_flags } arc_flags_t; +typedef enum arc_buf_flags { + ARC_BUF_FLAG_SHARED = 1 << 0, + ARC_BUF_FLAG_COMPRESSED = 1 << 1 +} arc_buf_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; + arc_buf_flags_t b_prop_flags; }; typedef enum arc_buf_contents { @@ -201,14 +207,22 @@ typedef struct arc_buf_info { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, - arc_buf_contents_t type); -arc_buf_t *arc_loan_buf(spa_t *spa, uint64_t size); +boolean_t arc_is_metadata(arc_buf_t *buf); +enum zio_compress arc_get_compression(arc_buf_t *buf); +int arc_decompress(arc_buf_t *buf); +arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, + int32_t size); +arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void 
*tag, + uint64_t psize, uint64_t lsize, enum zio_compress compression_type); +arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); +arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_destroy(arc_buf_t *buf, void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); +uint64_t arc_buf_lsize(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index c23187d6a62e..d2dc527feb4e 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -92,6 +92,7 @@ struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; + boolean_t acb_compressed; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index a8ed2868f744..83919a624c5a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -421,8 +422,8 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 -void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, - struct zio_prop *zp); +void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, + enum zio_compress compress_override, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. 
* You must dmu_bonus_hold() to get the buffer, which will give you a diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index 871f5625460e..e9bef8bddb73 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -41,14 +41,14 @@ struct dmu_replay_record; extern const char *recv_clone_name; int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, - struct vnode *vp, offset_t *off); + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, struct vnode *vp, offset_t *off); int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - uint64_t *sizep); + boolean_t stream_compressed, uint64_t *sizep); int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - uint64_t *sizep); + boolean_t stream_compressed, uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, struct vnode *vp, offset_t *off); typedef struct dmu_recv_cookie { diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 4d2f5e418bee..eb0c6838b952 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -108,7 +108,9 @@ struct dsl_pool; #define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" #define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" #define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" #define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" +#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose diff --git a/include/sys/refcount.h b/include/sys/refcount.h index ac82a4106dd1..580976c912bf 100644 --- a/include/sys/refcount.h +++ 
b/include/sys/refcount.h @@ -98,7 +98,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } -#define refcount_transfer_ownership(rc, current_holder, new_holder) +#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 #define refcount_init() #define refcount_fini() diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 5157c6704f4b..3ec812ac0f51 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -96,20 +96,21 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) -#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) +#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 21) +#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ - DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ + DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_LARGE_DNODE) + DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE) /* Are all features in the given flag word currently supported? 
*/ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -162,6 +163,12 @@ typedef enum dmu_send_resume_token_version { #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) +/* deal with compressed drr_write replay records */ +#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) +#define DRR_WRITE_PAYLOAD_SIZE(drrw) \ + (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ + (drrw)->drr_logical_size) + /* * zfs ioctl command structure */ @@ -210,12 +217,16 @@ typedef struct dmu_replay_record { dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; - uint64_t drr_length; + uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_checksumflags; - uint8_t drr_pad2[6]; - ddt_key_t drr_key; /* deduplication key */ + uint8_t drr_compressiontype; + uint8_t drr_pad2[5]; + /* deduplication key */ + ddt_key_t drr_key; + /* only nonzero if drr_compressiontype is not 0 */ + uint64_t drr_compressed_size; /* content follows */ } drr_write; struct drr_free { diff --git a/include/sys/zio.h b/include/sys/zio.h index 53729f2840ed..e12844f0c849 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -98,26 +98,6 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 -enum zio_compress { - ZIO_COMPRESS_INHERIT = 0, - ZIO_COMPRESS_ON, - ZIO_COMPRESS_OFF, - ZIO_COMPRESS_LZJB, - ZIO_COMPRESS_EMPTY, - ZIO_COMPRESS_GZIP_1, - ZIO_COMPRESS_GZIP_2, - ZIO_COMPRESS_GZIP_3, - ZIO_COMPRESS_GZIP_4, - ZIO_COMPRESS_GZIP_5, - ZIO_COMPRESS_GZIP_6, - ZIO_COMPRESS_GZIP_7, - ZIO_COMPRESS_GZIP_8, - ZIO_COMPRESS_GZIP_9, - ZIO_COMPRESS_ZLE, - ZIO_COMPRESS_LZ4, - ZIO_COMPRESS_FUNCTIONS -}; - /* * The number of "legacy" compression functions which can be set on individual * objects. 
@@ -407,6 +387,8 @@ struct zio { void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; + /* io_lsize != io_orig_size iff this is a raw write */ + uint64_t io_lsize; /* Data represented by this I/O */ void *io_data; @@ -464,11 +446,11 @@ extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t size, zio_done_func_t *done, void *private, + uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 63863c713c18..da58ef7aa5ec 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -22,17 +22,36 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#include - #ifdef __cplusplus extern "C" { #endif +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_FUNCTIONS +}; + /* Common signature for all zio compress functions. 
*/ typedef size_t zio_compress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 0d19c3bf4d6a..15cc5e08da3a 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -352,8 +352,10 @@ cksummer(void *arg) { struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; + uint64_t payload_size; - (void) ssread(buf, drrw->drr_length, ofp); + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + (void) ssread(buf, payload_size, ofp); /* * Use the existing checksum if it's dedup-capable, @@ -366,7 +368,7 @@ cksummer(void *arg) zio_cksum_t tmpsha256; zio_checksum_SHA256(buf, - drrw->drr_length, &tmpsha256); + payload_size, &tmpsha256); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); @@ -396,7 +398,7 @@ cksummer(void *arg) wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; - wbr_drrr->drr_length = drrw->drr_length; + wbr_drrr->drr_length = drrw->drr_logical_size; wbr_drrr->drr_toguid = drrw->drr_toguid; wbr_drrr->drr_refguid = dataref.ref_guid; wbr_drrr->drr_refobject = @@ -418,7 +420,7 @@ cksummer(void *arg) goto out; } else { /* block not previously seen */ - if (dump_record(drr, buf, drrw->drr_length, + if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) goto out; } @@ -841,7 +843,7 @@ typedef struct send_dump_data { uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; - boolean_t large_block; + boolean_t large_block, compress; int outfd; boolean_t err; nvlist_t *fss; @@ -857,7 +859,7 @@ typedef struct send_dump_data { static int estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, - boolean_t fromorigin, uint64_t *sizep) + boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -870,6 +872,7 @@ estimate_ioctl(zfs_handle_t 
*zhp, uint64_t fromsnap_obj, zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_guid = 1; /* estimate flag */ + zc.zc_flags = flags; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; @@ -1108,6 +1111,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; + enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; @@ -1136,6 +1140,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) if (istosnap) sdd->seento = B_TRUE; + if (sdd->large_block) + flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (sdd->embed_data) + flags |= LZC_SEND_FLAG_EMBED_DATA; + if (sdd->compress) + flags |= LZC_SEND_FLAG_COMPRESS; + if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; @@ -1182,7 +1193,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) if (sdd->verbose) { uint64_t size = 0; (void) estimate_ioctl(zhp, sdd->prevsnap_obj, - fromorigin, &size); + fromorigin, flags, &size); send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? 
sdd->prevsnap : NULL, @@ -1207,12 +1218,6 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) } } - enum lzc_send_flags flags = 0; - if (sdd->large_block) - flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (sdd->embed_data) - flags |= LZC_SEND_FLAG_EMBED_DATA; - err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); @@ -1518,8 +1523,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress || nvlist_exists(resume_nvl, "compressok")) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { @@ -1552,7 +1561,8 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, if (flags->verbose) { uint64_t size = 0; - error = lzc_send_space(zhp->zfs_name, fromname, &size); + error = lzc_send_space(zhp->zfs_name, fromname, + lzc_flags, &size); if (error == 0) size = MAX(0, (int64_t)(size - bytes)); send_print_verbose(stderr, zhp->zfs_name, fromname, @@ -1781,6 +1791,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; + sdd.compress = flags->compress; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) @@ -2876,11 +2887,17 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) case DRR_WRITE: if (byteswap) { - drr->drr_u.drr_write.drr_length = - BSWAP_64(drr->drr_u.drr_write.drr_length); + drr->drr_u.drr_write.drr_logical_size = + BSWAP_64( + drr->drr_u.drr_write.drr_logical_size); + drr->drr_u.drr_write.drr_compressed_size = + BSWAP_64( + 
drr->drr_u.drr_write.drr_compressed_size); } + uint64_t payload_size = + DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); (void) recv_read(hdl, fd, buf, - drr->drr_u.drr_write.drr_length, B_FALSE, NULL); + payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 74370f40123b..4fad64eafe0a 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -484,6 +484,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd, fnvlist_add_string(args, "fromsnap", from); if (flags & LZC_SEND_FLAG_LARGE_BLOCK) fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); if (resumeobj != 0 || resumeoff != 0) { @@ -511,7 +513,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd, * an equivalent snapshot. */ int -lzc_send_space(const char *snapname, const char *from, uint64_t *spacep) +lzc_send_space(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; @@ -520,6 +523,12 @@ lzc_send_space(const char *snapname, const char *from, uint64_t *spacep) args = fnvlist_alloc(); if (from != NULL) fnvlist_add_string(args, "from", from); + if (flags & LZC_SEND_FLAG_LARGE_BLOCK) + fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 6921e3b4e119..e13fc1a52143 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -175,7 +175,7 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBsend\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR 
+\fBzfs\fR \fBsend\fR [\fB-DnPpRveLc\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .fi .LP @@ -2687,7 +2687,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the .sp .ne 2 .na -\fBzfs send\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR +\fBzfs send\fR [\fB-DnPpRveLc\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .ad .sp .6 .RS 4n @@ -2768,6 +2768,22 @@ then the receiving system must have that feature enabled as well. See \fBembedded_data\fR feature. .RE +.sp +.ne 2 +.na +\fB\fB-c\fR, \fB--compressed\fR\fR +.ad +.sp .6 +.RS 4n +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory (see the \fBcompression\fR property +for details). If the \fBlz4_compress\fR feature is active on the sending +system, then the receiving system must have that feature enabled as well. If +the \fBlarge_blocks\fR feature is enabled on the sending system but the \fB-L\fR +option is not supplied in conjunction with \fB-c\fR, then the data will be +decompressed before sending so it can be split into smaller block sizes. +.RE + .sp .ne 2 .na @@ -2820,7 +2836,7 @@ The format of the stream is committed. You will be able to receive your streams .sp .ne 2 .na -\fBzfs send\fR [\fB-Le\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +\fBzfs send\fR [\fB-Lec\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .ad .sp .6 .RS 4n @@ -2862,6 +2878,22 @@ then the receiving system must have that feature enabled as well. See \fBembedded_data\fR feature. .RE +.sp +.ne 2 +.na +\fB\fB-c\fR, \fB--compressed\fR\fR +.ad +.sp .6 +.RS 4n +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory (see the \fBcompression\fR property +for details). 
If the \fBlz4_compress\fR feature is active on the sending +system, then the receiving system must have that feature enabled as well. If +the \fBlarge_blocks\fR feature is enabled on the sending system but the \fB-L\fR +option is not supplied in conjunction with \fB-c\fR, then the data will be +decompressed before sending so it can be split into smaller block sizes. +.RE + .sp .ne 2 .na diff --git a/module/zfs/arc.c b/module/zfs/arc.c index bf73d51415ce..959d3fe88034 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -77,10 +77,10 @@ * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal arc algorithms for + * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the - * arc list locks. + * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most @@ -93,21 +93,12 @@ * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * - * Each arc state also has a mutex which is used to protect the + * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an arc list lock you + * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * - * Arc buffers may have an associated eviction callback function. - * This function will be invoked prior to removing the buffer (e.g. - * in arc_do_user_evicts()). Note however that the data associated - * with the buffer may be evicted prior to the callback. 
The callback - * must be made with *no locks held* (to prevent deadlock). Additionally, - * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_clear_callback() - * and arc_do_user_evicts(). - * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so @@ -144,67 +135,81 @@ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). - * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer, and always contains uncompressed data. The ARC will provide - * references to this data and will keep it cached until it is no longer in - * use. Typically, the arc will try to cache only the L1ARC's physical data - * block and will aggressively evict any arc_buf_t that is no longer referenced. - * The amount of memory consumed by the arc_buf_t's can be seen via the + * + * The L1ARC's data pointer may or may not be uncompressed. The ARC has the + * ability to store the physical data (b_pdata) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * it will match its on-disk compression characteristics. This behavior can be + * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * Data in the L1ARC is not accessed by consumers of the ARC directly. 
Each + * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. + * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer. The ARC will provide references to this data and will keep it + * cached until it is no longer in use. The ARC caches only the L1ARC's physical + * data block and will evict any arc_buf_t that is no longer referenced. The + * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * + * Depending on the consumer, an arc_buf_t can be requested in uncompressed or + * compressed form. The typical case is that consumers will want uncompressed + * data, and when that happens a new data buffer is allocated where the data is + * decompressed for them to use. Currently the only consumer who wants + * compressed arc_buf_t's is "zfs send", when it streams data exactly as it + * exists on disk. When this happens, the arc_buf_t's data buffer is shared + * with the arc_buf_hdr_t. * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * (potentially) | | | | - * compressed | | | | - * data +------+ | v - * +->+------+ +------+ - * uncompressed | | | | - * data | | | | - * +------+ +------+ + * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The + * first one is owned by a compressed send consumer (and therefore references + * the same compressed data buffer as the arc_buf_hdr_t) and the second could be + * used by any other consumer (and has its own uncompressed copy of the data + * buffer). * - * The L1ARC's data pointer, however, may or may not be uncompressed. 
The - * ARC has the ability to store the physical data (b_pdata) associated with - * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk - * physical block, it will match its on-disk compression characteristics. - * If the block on-disk is compressed, then the physical data block - * in the cache will also be compressed and vice-versa. This behavior - * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an - * uncompressed version of the on-disk data. + * arc_buf_hdr_t + * +-----------+ + * | fields | + * | common to | + * | L1- and | + * | L2ARC | + * +-----------+ + * | l2arc_buf_hdr_t + * | | + * +-----------+ + * | l1arc_buf_hdr_t + * | | arc_buf_t + * | b_buf +------------>+-----------+ arc_buf_t + * | b_pdata +-+ |b_next +---->+-----------+ + * +-----------+ | |-----------| |b_next +-->NULL + * | |b_comp = T | +-----------+ + * | |b_data +-+ |b_comp = F | + * | +-----------+ | |b_data +-+ + * +->+------+ | +-----------+ | + * compressed | | | | + * data | |<--------------+ | uncompressed + * +------+ compressed, | data + * shared +-->+------+ + * data | | + * | | + * +------+ * * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, - * then an additional arc_buf_t is allocated and the uncompressed data is - * bcopied from the existing arc_buf_t. If the hdr is cached but does not - * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses - * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's - * b_pdata is not compressed, then the block is shared with the newly - * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t - * in the arc buffer chain. 
Sharing the block reduces the memory overhead - * required when the hdr is caching uncompressed blocks or the compressed - * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new + * arc_buf_t and either copies uncompressed data into a new data buffer from an + * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a + * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * hdr is compressed and the desired compression characteristics of the + * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the + * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be + * the last buffer in the hdr's b_buf list, however a shared compressed buf can + * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t: + * sharing its data with an arc_buf_t (note that the shared uncompressed buf is + * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ @@ -233,20 +238,24 @@ * | +------+ | * +---------------------------------+ * - * Writing to the arc requires that the ARC first discard the b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pdata * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline - * performs the write, it may compress the data before writing it to disk. - * The ARC will be called with the transformed data and will bcopy the - * transformed on-disk block into a newly allocated b_pdata. + * will be contained in the arc_buf_t. As the I/O pipeline performs the write, + * it may compress the data before writing it to disk. The ARC will be called + * with the transformed data and will bcopy the transformed on-disk block into + * a newly allocated b_pdata. 
Writes are always done into buffers which have + * either been loaned (and hence are new and don't have other readers) or + * buffers which have been released (and hence have their own hdr, if there + * were originally other readers of the buf's original hdr). This ensures that + * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pdata. The * L2ARC will always write the contents of b_pdata to the L2ARC. This means - * that when compressed arc is enabled that the L2ARC blocks are identical + * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed - * arc is disabled, then the L2ARC's block must be transformed to look + * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. */ @@ -842,6 +851,8 @@ static taskq_t *arc_prune_taskq; HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) +#define ARC_BUF_SHARED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes @@ -924,7 +935,7 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ + arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ @@ -1278,12 +1289,39 @@ buf_init(void) #define ARC_MINTIME (hz>>4) /* 62 ms */ +/* + * This is the size that the buf occupies in memory. 
If the buf is compressed, + * it will correspond to the compressed size. You should use this method of + * getting the buf size unless you explicitly need the logical size. + */ +uint64_t +arc_buf_size(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); +} + +uint64_t +arc_buf_lsize(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +enum zio_compress +arc_get_compression(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + IMPLY(shared, ARC_BUF_SHARED(buf)); + IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); return (shared); } @@ -1315,7 +1353,8 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), &zc); + + fletcher_2_native(buf->b_data, arc_buf_size(buf), &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); @@ -1400,14 +1439,22 @@ arc_cksum_compute(arc_buf_t *buf) return; ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } + + ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), + fletcher_2_native(buf->b_data, arc_buf_size(buf), hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); @@ -1439,7 
+1486,7 @@ arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) - ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), + ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } @@ -1457,6 +1504,12 @@ arc_buf_type(arc_buf_hdr_t *hdr) return (type); } +boolean_t +arc_is_metadata(arc_buf_t *buf) +{ + return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); +} + static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { @@ -1478,14 +1531,23 @@ arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(hdr)) - panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } + /* + * Compressed buffers do not manipulate the b_freeze_cksum or + * allocate b_thawed. + */ + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + hdr->b_l1hdr.b_bufcnt > 1); + return; + } + ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); @@ -1500,6 +1562,12 @@ arc_buf_freeze(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + hdr->b_l1hdr.b_bufcnt > 1); + return; + } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -1508,7 +1576,6 @@ arc_buf_freeze(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); - } /* @@ -1565,16 +1632,14 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) } } -static int +int arc_decompress(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; int error; - if (arc_buf_is_shared(buf)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { /* * The 
arc_buf_hdr_t is either not compressed or is * associated with an embedded block or a hole in which @@ -1582,11 +1647,31 @@ arc_decompress(arc_buf_t *buf) */ IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + if (!arc_buf_is_shared(buf)) { + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + HDR_GET_LSIZE(hdr)); + } } else { - ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + + /* + * If the buf is compressed and sharing data with hdr, unlink + * its data buf from the header and make it uncompressed. + */ + if (ARC_BUF_COMPRESSED(buf)) { + buf->b_prop_flags &= + ~(ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED); + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Previously this buf was shared so overhead was 0, so + * just add new overhead. 
+ */ + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + } + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); @@ -1633,7 +1718,6 @@ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1642,7 +1726,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + (void) refcount_add_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); return; } @@ -1652,11 +1737,11 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } - (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); + (void) refcount_add_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); } } @@ -1666,10 +1751,9 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) * so that we can add and remove them from the refcount individually. 
*/ static void -arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1679,7 +1763,7 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], - lsize, hdr); + HDR_GET_LSIZE(hdr), hdr); return; } @@ -1689,12 +1773,11 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); (void) refcount_remove_many(&state->arcs_esize[type], - lsize, buf); + arc_buf_size(buf), buf); } } @@ -1724,7 +1807,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); - arc_evitable_space_decrement(hdr, state); + arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); @@ -1861,7 +1944,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } - arc_evitable_space_decrement(hdr, old_state); + arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* @@ -1924,13 +2007,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. 
*/ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, + arc_buf_size(buf)); (void) refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -1947,6 +2030,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); /* * When moving a header off of a ghost state, @@ -1958,7 +2042,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -1980,13 +2063,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, + arc_buf_size(buf)); (void) refcount_remove_many( - &old_state->arcs_size, HDR_GET_LSIZE(hdr), + &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -2087,11 +2170,11 @@ arc_space_return(uint64_t space, arc_space_type_t type) } /* - * Allocate an initial buffer for this hdr, subsequent buffers will - * use arc_buf_clone(). + * Allocate either the first buffer for this hdr, or a compressed buffer for + * this hdr. Subsequent non-compressed buffers use arc_buf_clone(). 
*/ static arc_buf_t * -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed) { arc_buf_t *buf; @@ -2100,9 +2183,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; @@ -2112,7 +2192,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_next = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; add_reference(hdr, tag); @@ -2123,19 +2203,30 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * If the hdr's data can be shared (no byteswapping, hdr is - * uncompressed, hdr's data is not currently being written to the - * L2ARC write) then we share the data buffer and set the appropriate - * bit in the hdr's b_flags to indicate the hdr is sharing it's - * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to - * store the buf's data. + * If the hdr's data can be shared (no byteswapping, hdr compression + * matches the requested buf compression) then we share the data buffer + * and set the appropriate bit in the hdr's b_flags to indicate + * the hdr is sharing it's b_pdata with the arc_buf_t. Otherwise, we + * allocate a new buffer to store the buf's data. 
*/ - if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && compressed && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(!HDR_SHARED_DATA(hdr)); + buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_prop_flags = + ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + !compressed && HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(ARC_BUF_LAST(buf)); buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_prop_flags = ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!compressed); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + buf->b_prop_flags = 0; ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } @@ -2159,10 +2250,12 @@ arc_buf_clone(arc_buf_t *from) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!ARC_BUF_COMPRESSED(from)); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; + buf->b_prop_flags = 0; buf->b_next = hdr->b_l1hdr.b_buf; hdr->b_l1hdr.b_buf = buf; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); @@ -2182,16 +2275,27 @@ static char *arc_onloan_tag = "onloan"; * freed. */ arc_buf_t * -arc_loan_buf(spa_t *spa, uint64_t size) +arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { - arc_buf_t *buf; - - buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, + is_metadata ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA, size); atomic_add_64(&arc_loaned_bytes, size); return (buf); } +arc_buf_t * +arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, + psize, lsize, compression_type); + + atomic_add_64(&arc_loaned_bytes, psize); + return (buf); +} + + /* * Return a loaned arc buffer to the arc. */ @@ -2205,7 +2309,7 @@ arc_return_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); + atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -2219,7 +2323,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); + atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } static void @@ -2276,6 +2380,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pdata = buf->b_data; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_prop_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need @@ -2284,7 +2389,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void @@ -2302,6 +2407,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); hdr->b_l1hdr.b_pdata = NULL; + buf->b_prop_flags &= 
~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between @@ -2309,21 +2415,59 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + * Remove an arc_buf_t from the hdr's buf list and return the last + * arc_buf_t on the list. If no buffers remain on the list then return + * NULL. + */ +static arc_buf_t * +arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; + arc_buf_t *lastbuf = NULL; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Remove the buf from the hdr list and locate the last + * remaining buffer on the list. + */ + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); + IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); + + return (lastbuf); +} + +/* + * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's + * list and free it. 
*/ static void -arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf) { - arc_buf_t **bufp; + arc_buf_t *lastbuf; arc_buf_hdr_t *hdr = buf->b_hdr; - arc_buf_t *lastbuf = NULL; - uint64_t size = HDR_GET_LSIZE(hdr); - boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); /* * Free up the data associated with the buf but only @@ -2338,14 +2482,15 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - arc_cksum_verify(buf); + if (!ARC_BUF_COMPRESSED(buf)) { + arc_cksum_verify(buf); + } arc_buf_unwatch(buf); - if (destroyed_buf_is_shared) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(HDR_SHARED_DATA(hdr)); + if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } @@ -2355,53 +2500,53 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) hdr->b_l1hdr.b_bufcnt -= 1; } - /* only remove the buf if requested */ - if (!remove) - return; - - /* remove the buf from the hdr list */ - bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != NULL) { - if (*bufp == buf) - *bufp = buf->b_next; + lastbuf = arc_buf_remove(hdr, buf); + if (ARC_BUF_COMPRESSED(buf)) { /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. + * For compressed, shared buffers we don't need to do anything + * special so take the opportunity to ensure that compressed + * buffers must be shared. The hdr has already been marked as + * not shared and we already cleared b_data, so just check the + * flag on the buf. 
*/ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); + VERIFY(ARC_BUF_SHARED(buf)); + } else if (ARC_BUF_SHARED(buf)) { + ASSERT(!ARC_BUF_COMPRESSED(buf)); - /* - * If the current arc_buf_t is sharing its data - * buffer with the hdr, then reassign the hdr's - * b_pdata to share it with the new buffer at the end - * of the list. The shared buffer is always the last one - * on the hdr's buffer list. - */ - if (destroyed_buf_is_shared && lastbuf != NULL) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(ARC_BUF_LAST(lastbuf)); - VERIFY(!arc_buf_is_shared(lastbuf)); + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (lastbuf != NULL) { + VERIFY(!arc_buf_is_shared(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. + */ + arc_share_buf(hdr, lastbuf); + } + } else if (HDR_SHARED_DATA(hdr)) { /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. 
*/ - arc_share_buf(hdr, lastbuf); - } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + ASSERT3P(lastbuf, !=, NULL); + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); } if (hdr->b_l1hdr.b_bufcnt == 0) @@ -2456,11 +2601,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compress, arc_buf_contents_t type) + enum zio_compress compression_type, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - ASSERT3U(lsize, >, 0); VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); @@ -2472,7 +2616,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compress); + arc_hdr_set_compress(hdr, compression_type); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; @@ -2593,14 +2737,42 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * -arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_t *buf; arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - buf = arc_buf_alloc_impl(hdr, tag); + + buf = arc_buf_alloc_impl(hdr, tag, B_FALSE); + arc_buf_thaw(buf); + + return (buf); +} + +/* + * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this + * for bufs containing metadata. 
+ */ +arc_buf_t * +arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT(compression_type > ZIO_COMPRESS_OFF); + ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + compression_type, ARC_BUFC_DATA); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + buf = arc_buf_alloc_impl(hdr, tag, B_TRUE); arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + return (buf); } @@ -2667,7 +2839,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pdata != NULL) { arc_hdr_free_pdata(hdr); @@ -2706,16 +2878,10 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } -uint64_t -arc_buf_size(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - /* * Evict the arc_buf_hdr that is provided as a parameter. 
The resultant * state of the header is dependent on its state prior to entering this @@ -2759,7 +2925,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* @@ -2774,7 +2939,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2803,7 +2967,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { @@ -3314,7 +3478,7 @@ arc_adjust_meta_only(void) /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the - * space alloted to the MFU (which is defined as arc_c - arc_p). + * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); @@ -4438,7 +4602,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } @@ -4476,11 +4640,11 @@ static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; - arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = B_FALSE; - + boolean_t freeable = B_FALSE; + arc_buf_t *decomp_buf = NULL; + int callback_cnt = 0; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. 
We should find this header, since @@ -4538,39 +4702,45 @@ arc_read_done(zio_t *zio) arc_access(hdr, hash_lock); } - /* create copies of the data buffer for the callers */ - for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done != NULL) { - /* - * If we're here, then this must be a demand read - * since prefetch requests don't have callbacks. - * If a read request has a callback (i.e. acb_done is - * not NULL), then we decompress the data for the - * first request and clone the rest. This avoids - * having to waste cpu resources decompressing data - * that nobody is explicitly waiting to read. - */ - if (abuf == NULL) { - acb->acb_buf = arc_buf_alloc_impl(hdr, - acb->acb_private); + /* create buffers for the callers. only decompress the data once. */ + for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + if (!acb->acb_done) + continue; + + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. 
+ */ + + callback_cnt++; + if (acb->acb_compressed && !HDR_SHARED_DATA(hdr) && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private, B_TRUE); + } else { + if (decomp_buf == NULL) { + decomp_buf = arc_buf_alloc_impl(hdr, + acb->acb_private, B_FALSE); if (zio->io_error == 0) { zio->io_error = - arc_decompress(acb->acb_buf); + arc_decompress(decomp_buf); } - abuf = acb->acb_buf; + acb->acb_buf = decomp_buf; } else { add_reference(hdr, acb->acb_private); - acb->acb_buf = arc_buf_clone(abuf); + acb->acb_buf = arc_buf_clone(decomp_buf); } } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (abuf == NULL) { - /* - * This buffer didn't have a callback so it must - * be a prefetch. - */ + if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); @@ -4655,6 +4825,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || @@ -4755,19 +4926,43 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* - * If this block is already in use, create a new - * copy of the data so that we will be guaranteed - * that arc_release() will always succeed. + * If we're doing a raw read, the header hasn't been + * shared yet, the header contains compressed data, and + * the data does not need to be byteswapped, use the + * header's b_pdata as the new buf's b_data. Otherwise, + * we'll either need to clone an existing decompressed + * buf or decompress the data ourselves. 
*/ - buf = hdr->b_l1hdr.b_buf; - if (buf == NULL) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - buf = arc_buf_alloc_impl(hdr, private); - VERIFY0(arc_decompress(buf)); + if (compressed_read && !HDR_SHARED_DATA(hdr) && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { + buf = arc_buf_alloc_impl(hdr, private, B_TRUE); } else { - add_reference(hdr, private); - buf = arc_buf_clone(buf); + /* search for a decompressed buf */ + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (!ARC_BUF_COMPRESSED(buf)) + break; + } + + if (buf == NULL) { + /* there could be one compressed buf */ + IMPLY(HDR_SHARED_DATA(hdr), + refcount_count( + &hdr->b_l1hdr.b_refcnt) == 1); + /* otherwise there won't be any */ + IMPLY(!HDR_SHARED_DATA(hdr), + refcount_count( + &hdr->b_l1hdr.b_refcnt) == 0); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, + ==, NULL); + buf = arc_buf_alloc_impl(hdr, private, + B_FALSE); + VERIFY0(arc_decompress(buf)); + } else { + add_reference(hdr, private); + buf = arc_buf_clone(buf); + } } ASSERT3P(buf->b_data, !=, NULL); @@ -4840,6 +5035,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * This is a delicate dance that we play here. 
@@ -4880,6 +5076,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; + acb->acb_compressed = compressed_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; @@ -5164,7 +5361,7 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); @@ -5188,7 +5385,6 @@ arc_release(arc_buf_t *buf, void *tag) */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; - arc_buf_t **bufp; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); @@ -5200,35 +5396,15 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); + if (arc_buf_is_shared(buf)) ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(buf)); - } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ - bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != NULL) { - if (*bufp == buf) { - *bufp = buf->b_next; - } - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. 
- */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); + lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* @@ -5239,7 +5415,6 @@ arc_release(arc_buf_t *buf, void *tag) */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(lastbuf)); VERIFY(!arc_buf_is_shared(lastbuf)); /* @@ -5249,21 +5424,46 @@ arc_release(arc_buf_t *buf, void *tag) * on the arc_buf_t list. */ arc_unshare_buf(hdr, buf); - arc_share_buf(hdr, lastbuf); + + /* + * If the buf we removed was compressed, then + * we need to allocate a new compressed block for the + * hdr and copy the data over. Otherwise, the + * buffer was uncompressed and we can now share + * the data with the lastbuf. + */ + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); + arc_hdr_alloc_pdata(hdr); + bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); + arc_share_buf(hdr, lastbuf); + } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. 
+ */ + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; @@ -5357,15 +5557,13 @@ arc_write_ready(zio_t *zio) /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the - * callback as invoked. + * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pdata != NULL) { if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); - arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5401,19 +5599,27 @@ arc_write_ready(zio_t *zio) * arc thus the on-disk block may or may not match what we maintain * in the hdr's b_pdata field. 
*/ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); arc_hdr_alloc_pdata(hdr); bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); } else { ASSERT3P(buf->b_data, ==, zio->io_orig_data); - ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(zio->io_orig_size, ==, HDR_GET_PSIZE(hdr)); + } else { + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + } + EQUIV(HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF, + ARC_BUF_COMPRESSED(buf)); /* * This hdr is not compressed so we're able to share @@ -5550,6 +5756,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF); + zio_flags |= ZIO_FLAG_RAW; + } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5570,7 +5780,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5581,7 +5790,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, + HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? 
arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 938db4270796..1a765022c891 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -901,7 +901,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, blksz); + abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; @@ -1030,8 +1030,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, - db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, + db->db.db_size)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && @@ -1083,6 +1083,70 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) return (SET_ERROR(err)); } +/* + * This is our just-in-time copy function. It makes a copy of buffers that + * have been modified in a previous transaction group before we access them in + * the current active group. + * + * This function is used in three places: when we are dirtying a buffer for the + * first time in a txg, when we are freeing a range in a dnode that includes + * this buffer, and when we are accessing a buffer which was received compressed + * and later referenced in a WRITE_BYREF record. + * + * Note that when we are called from dbuf_free_range() we do not put a hold on + * the buffer, we just traverse the active dbuf list for the dnode. 
+ */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. + */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DMU_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dnode_t *dn = DB_DNODE(db); + int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); + arc_space_consume(bonuslen, ARC_SPACE_BONUS); + bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = arc_buf_size(db->db_buf); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa = db->db_objset->os_spa; + enum zio_compress compress_type = + arc_get_compression(db->db_buf); + + if (compress_type == ZIO_COMPRESS_OFF) { + dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, + size, arc_buf_lsize(db->db_buf), compress_type); + } + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + db->db_buf = NULL; + dbuf_clear_data(db); + } +} + int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { @@ -1111,6 +1175,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { + /* + * If the arc buf is compressed, we need to decompress it to + * read the data. 
This could happen during the "zfs receive" of + * a stream which is compressed and deduplicated. + */ + if (db->db_buf != NULL && + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { + dbuf_fix_old_data(db, + spa_syncing_txg(dmu_objset_spa(db->db_objset))); + err = arc_decompress(db->db_buf); + dbuf_set_data(db, db->db_buf); + } mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); @@ -1187,7 +1263,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -1197,62 +1273,6 @@ dbuf_noread(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); } -/* - * This is our just-in-time copy function. It makes a copy of - * buffers, that have been modified in a previous transaction - * group, before we modify them in the current active group. - * - * This function is used in two places: when we are dirtying a - * buffer for the first time in a txg, and when we are freeing - * a range in a dnode that includes this buffer. - * - * Note that when we are called from dbuf_free_range() we do - * not put a hold on the buffer, we just traverse the active - * dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. - */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dnode_t *dn = DB_DNODE(db); - int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); - arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); - } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - - dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - db->db_buf = NULL; - dbuf_clear_data(db); - } -} - void dbuf_unoverride(dbuf_dirty_record_t *dr) { @@ -1480,7 +1500,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ obuf = db->db_buf; @@ -2045,9 +2065,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); - ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); - ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(arc_buf_lsize(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); @@ -2690,7 +2710,7 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) dbuf_set_data(dh->dh_db, 
arc_alloc_buf(dh->dh_dn->dn_objset->os_spa, - dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); + dh->dh_db, dh->dh_type, dh->dh_db->db.db_size)); bcopy(dh->dh_dr->dt.dl.dr_data->b_data, dh->dh_db->db.db_data, dh->dh_db->db.db_size); } @@ -3321,10 +3341,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ - int blksz = arc_buf_size(*datap); + int psize = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_alloc_buf(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); + enum zio_compress compress_type = arc_get_compression(*datap); + + if (compress_type == ZIO_COMPRESS_OFF) { + *datap = arc_alloc_buf(os->os_spa, db, type, psize); + } else { + int lsize = arc_buf_lsize(*datap); + ASSERT3U(type, ==, ARC_BUFC_DATA); + *datap = arc_alloc_compressed_buf(os->os_spa, db, + psize, lsize, compress_type); + } + bcopy(db->db.db_data, (*datap)->b_data, psize); } db->db_data_pending = dr; @@ -3734,7 +3763,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; - dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + dmu_write_policy(os, dn, db->db_level, wp_flag, + (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ? + arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* @@ -3754,8 +3785,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) void *contents = (data != NULL) ? 
data->b_data : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, contents, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, + &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, + &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); @@ -3766,7 +3797,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, NULL, db->db.db_size, &zp, + &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 7992e3049760..9bbc5f3a66ed 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1038,7 +1038,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) int i = priv->next++; ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_size(abuf)); + ASSERT(off + n <= arc_buf_lsize(abuf)); iov = (iovec_t *)uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; @@ -1326,7 +1326,7 @@ dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - return (arc_loan_buf(db->db_objset->os_spa, size)); + return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* @@ -1351,7 +1351,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; - uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); @@ -1363,19 +1363,19 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_EXIT(dbuf); /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the 
dbuf, and the dbuf is not metadata. It - * can't be metadata because the loaned arc buf comes from the - * user-data kmem area. + * same size as the dbuf, and the dbuf is not metadata. */ - if (offset == db->db.db_offset && blksz == db->db.db_size && - DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { + if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; + /* compressed bufs must always be assignable to their dbuf */ + ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); + ASSERT(!(buf->b_prop_flags & ARC_BUF_FLAG_COMPRESSED)); + DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; @@ -1526,7 +1526,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, - zp, dmu_sync_late_arrival_ready, NULL, + zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); @@ -1579,7 +1579,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, + ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* @@ -1749,7 +1750,8 @@ int zfs_mdcomp_disable = 0; int zfs_redundant_metadata_most_ditto_level = 2; void -dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, + enum zio_compress override_compress, zio_prop_t *zp) { dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || @@ -1843,7 +1845,16 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } zp->zp_checksum = checksum; - zp->zp_compress = compress; + + /* + * If we're writing a pre-compressed buffer, the compression type we use + * must match the data. If it hasn't been compressed yet, then we should + * use the value dictated by the policies above. + */ + zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT + ? override_compress : compress; + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index ac98ab6f2165..970ee4f086ca 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -374,9 +374,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_alloc_buf(spa, - sizeof (objset_phys_t), &os->os_phys_buf, - ARC_BUFC_METADATA); + arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, sizeof (objset_phys_t)); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); @@ -389,8 +388,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_alloc_buf(spa, size, - &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } @@ -1175,7 +1174,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); - dmu_write_policy(os, NULL, 0, 0, &zp); + dmu_write_policy(os, NULL, 0, 0, ZIO_COMPRESS_INHERIT, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 07067ff1e44d..b14bcec4b6df 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -278,8 +278,10 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) + uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, + void *data) { + uint64_t payload_size; struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* @@ -290,7 +292,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + blksz - 1; + dsp->dsa_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either @@ -309,8 +311,26 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; - drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_logical_size = lsize; + + /* only set the compression fields if the buf is compressed */ + if (lsize != psize) { + ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); + 
ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); + ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); + ASSERT3S(psize, >, 0); + ASSERT3S(lsize, >=, psize); + + drrw->drr_compressiontype = BP_GET_COMPRESS(bp); + drrw->drr_compressed_size = psize; + payload_size = drrw->drr_compressed_size; + } else { + payload_size = drrw->drr_logical_size; + } + if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block @@ -329,7 +349,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_key.ddk_cksum = bp->blk_cksum; } - if (dump_record(dsp, data, blksz) != 0) + if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -505,7 +525,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* @@ -672,20 +692,47 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) arc_buf_t *abuf; int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + /* + * If we have large blocks stored on disk but the send flags + * don't allow us to send large blocks, we split the data from + * the arc buf into chunks. 
+ */ + boolean_t split_large_blocks = + data->datablkszsec > SPA_OLD_MAXBLOCKSIZE && + !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + /* + * We should only request compressed data from the ARC if all + * the following are true: + * - stream compression was requested + * - we aren't splitting large blocks into smaller chunks + * - the data won't need to be byteswapped before sending + * - this isn't an embedded block + * - this isn't metadata (if receiving on a different endian + * system it can be byteswapped more easily) + */ + boolean_t request_compressed = + (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && + !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && + !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + if (request_compressed) + zioflags |= ZIO_FLAG_RAW; + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_alloc_buf(spa, blksz, &abuf, - ARC_BUFC_DATA); + abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, + blksz); for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) @@ -697,21 +744,22 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) offset = zb->zb_blkid * blksz; - if (!(dsa->dsa_featureflags & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - blksz > SPA_OLD_MAXBLOCKSIZE) { + if (split_large_blocks) { char *buf = abuf->b_data; + ASSERT3U(arc_get_compression(abuf), ==, + ZIO_COMPRESS_OFF); while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsa, type, zb->zb_object, - offset, n, NULL, buf); + offset, n, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { - err = dump_write(dsa, 
type, zb->zb_object, - offset, blksz, bp, abuf->b_data); + err = dump_write(dsa, type, zb->zb_object, offset, + blksz, arc_buf_size(abuf), bp, + abuf->b_data); } arc_buf_destroy(abuf, &abuf); } @@ -738,9 +786,9 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, - boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { objset_t *os; @@ -789,8 +837,14 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; + } + if (compressok) { + featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + } + if ((featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != + 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (resumeobj != 0 || resumeoff != 0) { @@ -935,7 +989,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; @@ -972,10 +1026,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, 0, 0, vp, off); + embedok, 
large_block_ok, compressok, outfd, 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, 0, 0, vp, off); + embedok, large_block_ok, compressok, outfd, 0, 0, vp, off); } dsl_dataset_rele(ds, FTAG); return (err); @@ -983,7 +1037,8 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; @@ -1051,11 +1106,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, + embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, + embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, vp, off); } if (owned) @@ -1066,33 +1121,46 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } static int -dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, - uint64_t *sizep) +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, + uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err; + uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ + uint64_t recordsize; + uint64_t record_count; + + /* Assume all (uncompressed) blocks are recordsize. 
*/ + err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + &recordsize); + if (err != 0) + return (err); + record_count = uncompressed / recordsize; + + /* + * If we're estimating a send size for a compressed stream, use the + * compressed data size to estimate the stream size. Otherwise, use the + * uncompressed data size. + */ + size = stream_compressed ? compressed : uncompressed; + /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). - * Assume all blocks are recordsize. Assume ditto blocks and - * internal fragmentation counter out compression. + * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per - * block, which we observe in practice. + * block. */ - uint64_t recordsize; - err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); - if (err != 0) - return (err); - size -= size / recordsize * sizeof (blkptr_t); + size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ - size += size / recordsize * sizeof (dmu_replay_record_t); + size += record_count * sizeof (dmu_replay_record_t); *sizep = size; @@ -1100,10 +1168,11 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, } int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, + boolean_t stream_compressed, uint64_t *sizep) { int err; - uint64_t size; + uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); @@ -1122,33 +1191,41 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); - /* Get uncompressed size estimate of changed data. */ + /* Get compressed and uncompressed size estimates of changed data. 
*/ if (fromds == NULL) { - size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } else { - uint64_t used, comp; + uint64_t used; err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &size); + &used, &comp, &uncomp); if (err != 0) return (err); } - err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, + stream_compressed, sizep); return (err); } +struct calculate_send_arg { + uint64_t uncompressed; + uint64_t compressed; +}; + /* * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed size + * uncompressed and compressed sizes. */ /* ARGSUSED */ static int dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - uint64_t *spaceptr = arg; + struct calculate_send_arg *space = arg; if (bp != NULL && !BP_IS_HOLE(bp)) { - *spaceptr += BP_GET_UCSIZE(bp); + space->uncompressed += BP_GET_UCSIZE(bp); + space->compressed += BP_GET_PSIZE(bp); } return (0); } @@ -1160,10 +1237,10 @@ dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ int dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - uint64_t *sizep) + boolean_t stream_compressed, uint64_t *sizep) { int err; - uint64_t size = 0; + struct calculate_send_arg size = { 0 }; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); @@ -1181,10 +1258,12 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, */ err = traverse_dataset(ds, from_txg, TRAVERSE_POST, dmu_calculate_send_traversal, &size); + if (err) return (err); - err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, + size.compressed, stream_compressed, sizep); return (err); } @@ -1315,14 +1394,14 @@ dmu_recv_begin_check(void *arg, 
dmu_tx_t *tx) /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plan WRITE record, so the pool must have the + * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); @@ -1501,11 +1580,21 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, + 8, 1, &one, tx)); + } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_COMPRESSED) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, + 8, 1, &one, tx)); + } } dmu_buf_will_dirty(newds->ds_dbuf, tx); @@ -1563,7 +1652,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); @@ -1892,10 +1981,11 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); - DO64(drr_write.drr_length); + 
DO64(drr_write.drr_logical_size); DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); @@ -2137,7 +2227,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, dmu_buf_t *bonus; int err; - if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); @@ -2159,7 +2249,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_length); + drrw->drr_offset, drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); @@ -2169,9 +2259,10 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - drrw->drr_length); + DRR_WRITE_PAYLOAD_SIZE(drrw)); } + /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); @@ -2587,18 +2678,31 @@ receive_read_record(struct receive_arg *ra) case DRR_WRITE: { struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), - drrw->drr_length); + arc_buf_t *abuf; + boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); + if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + ASSERT(!is_meta); + abuf = arc_loan_compressed_buf( + dmu_objset_spa(ra->os), + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype); + } else { + abuf = 
arc_loan_buf(dmu_objset_spa(ra->os), + is_meta, drrw->drr_logical_size); + } err = receive_read_payload_and_next_header(ra, - drrw->drr_length, abuf->b_data); + DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); if (err != 0) { dmu_return_arcbuf(abuf); return (err); } ra->rrd->write_buf = abuf; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, - drrw->drr_length); + drrw->drr_logical_size); return (err); } case DRR_WRITE_BYREF: diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index dd390d49ab73..0f0783b7d8b9 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1759,10 +1759,18 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { fnvlist_add_string(token_nv, "toname", buf); } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_LARGEBLOCK) == 0) { + fnvlist_add_boolean(token_nv, "largeblockok"); + } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_EMBEDOK) == 0) { fnvlist_add_boolean(token_nv, "embedok"); } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_COMPRESSOK) == 0) { + fnvlist_add_boolean(token_nv, "compressok"); + } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 8e187d59ce99..14de14826f9e 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4461,6 +4461,7 @@ zfs_ioc_send(zfs_cmd_t *zc) boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); + boolean_t compressok = (zc->zc_flags & 0x4); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4508,7 +4509,7 @@ zfs_ioc_send(zfs_cmd_t *zc) } } - error = dmu_send_estimate(tosnap, fromsnap, + error = dmu_send_estimate(tosnap, fromsnap, compressok, &zc->zc_objset_type); if (fromsnap != NULL) @@ -4522,7 +4523,7 @@ zfs_ioc_send(zfs_cmd_t 
*zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, + zc->zc_fromobj, embedok, large_block_ok, compressok, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) @@ -5415,6 +5416,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * } @@ -5432,6 +5435,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) file_t *fp; boolean_t largeblockok; boolean_t embedok; + boolean_t compressok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; @@ -5443,6 +5447,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); @@ -5451,8 +5456,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, fd, - resumeobj, resumeoff, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, + fd, resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5468,6 +5473,12 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from + 
* (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted * } * * outnvl: { @@ -5481,6 +5492,11 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_dataset_t *tosnap; int error; char *fromname; + /* LINTED E_FUNC_SET_NOT_USED */ + boolean_t largeblockok; + /* LINTED E_FUNC_SET_NOT_USED */ + boolean_t embedok; + boolean_t compressok; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); @@ -5493,6 +5509,10 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); + error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { if (strchr(fromname, '@') != NULL) { @@ -5505,7 +5525,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; - error = dmu_send_estimate(tosnap, fromsnap, &space); + error = dmu_send_estimate(tosnap, fromsnap, compressok, + &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* @@ -5520,7 +5541,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, &space); + frombm.zbm_creation_txg, compressok, &space); } else { /* * from is not properly formatted as a snapshot or @@ -5531,7 +5552,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) } } else { // If estimating the size of a full send, use dmu_send_estimate - error = dmu_send_estimate(tosnap, NULL, &space); + error = 
dmu_send_estimate(tosnap, NULL, compressok, &space); } fnvlist_add_uint64(outnvl, "space", space); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 545a43d81424..892b86fbaf43 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -529,21 +529,24 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, - vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, - enum zio_stage stage, enum zio_stage pipeline) + void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + void *private, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, vdev_t *vd, uint64_t offset, + const zbookmark_phys_t *zb, enum zio_stage stage, + enum zio_stage pipeline) { zio_t *zio; - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); + zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); @@ -586,7 +589,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_data = zio->io_data = data; - zio->io_orig_size = zio->io_size = size; + zio->io_orig_size = zio->io_size = psize; + zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; @@ -626,7 +630,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, { zio_t *zio; - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, 
private, + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); @@ -735,7 +739,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, - data, size, done, private, + data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); @@ -745,7 +749,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, @@ -762,7 +766,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); - zio = zio_create(pio, spa, txg, bp, data, size, done, private, + zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); @@ -792,7 +796,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, { zio_t *zio; - zio = zio_create(pio, spa, txg, bp, data, size, done, private, + zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); @@ -872,8 +876,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, stage |= ZIO_STAGE_ISSUE_ASYNC; zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, - NULL, 0, NULL, ZIO_STAGE_OPEN, stage); + BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } @@ -906,8 +910,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, - NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } @@ -920,7 +924,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); @@ -948,9 +952,9 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, - NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + zio = zio_create(pio, 
vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -969,9 +973,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, - NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -1027,7 +1031,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); @@ -1048,7 +1052,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, - data, size, done, private, type, priority, + data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); @@ -1077,8 +1081,11 @@ zio_shrink(zio_t *zio, uint64_t size) * Note, BP_IS_RAIDZ() assumes no compression. 
*/ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - if (!BP_IS_RAIDZ(zio->io_bp)) - zio->io_orig_size = zio->io_size = size; + if (!BP_IS_RAIDZ(zio->io_bp)) { + /* we are not doing a raw write */ + ASSERT3U(zio->io_size, ==, zio->io_lsize); + zio->io_orig_size = zio->io_size = zio->io_lsize = size; + } } /* @@ -1128,10 +1135,12 @@ zio_write_bp_init(zio_t *zio) zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_size; - uint64_t psize = lsize; + uint64_t lsize = zio->io_lsize; + uint64_t psize = zio->io_size; int pass = 1; + EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); + /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. @@ -1217,7 +1226,8 @@ zio_write_bp_init(zio_t *zio) spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } - if (compress != ZIO_COMPRESS_OFF) { + /* If it's a compressed write that is not raw, compress the buffer. 
*/ + if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); if (psize == 0 || psize == lsize) { @@ -1263,6 +1273,9 @@ zio_write_bp_init(zio_t *zio) psize, lsize, NULL); } } + } else { + ASSERT3U(psize, !=, 0); + } /* @@ -2163,8 +2176,8 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, NULL, + (char *)pio->io_data + (pio->io_size - resid), lsize, + lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } @@ -2351,6 +2364,8 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) spa_t *spa = zio->io_spa; int p; + ASSERT0(zio->io_flags & ZIO_FLAG_RAW); + /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have @@ -2496,6 +2511,7 @@ zio_ddt_write(zio_t *zio) ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + ASSERT0(zio->io_flags & ZIO_FLAG_RAW); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); @@ -2548,7 +2564,7 @@ zio_ddt_write(zio_t *zio) } dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, - zio->io_orig_size, &czp, NULL, NULL, + zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); @@ -2570,7 +2586,7 @@ zio_ddt_write(zio_t *zio) ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, - zio->io_orig_size, zp, + zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); From c0b2e4a416c00fdff7535380e6ba44885d937f12 
Mon Sep 17 00:00:00 2001 From: David Quigley Date: Sun, 12 Jun 2016 22:47:35 -0400 Subject: [PATCH 03/11] Remove lint supression from dmu.h and unnecessary dmu.h include in spa.h --- include/sys/dmu.h | 8 +------- include/sys/spa.h | 3 +-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 83919a624c5a..4efab7c723cd 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -562,12 +562,7 @@ typedef struct dmu_buf_user { * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ -#ifdef __lint -/* Very ugly, but it beats issuing suppression directives in many Makefiles. */ -extern void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, - dmu_buf_t **clear_on_evict_dbufp); -#else /* __lint */ +/*ARGSUSED*/ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp) @@ -580,7 +575,6 @@ dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } -#endif /* __lint */ /* * Attach user data to a dbuf and mark it for normal (when the dbuf's diff --git a/include/sys/spa.h b/include/sys/spa.h index 738b4db30f63..369ee8ef9329 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -558,8 +559,6 @@ _NOTE(CONSTCOND) } while (0) ASSERT(len < size); \ } -#include - #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? 
\ ARC_BUFC_METADATA : ARC_BUFC_DATA) From 6066a66c41d08af55cf35ca3a090c5d51d785cac Mon Sep 17 00:00:00 2001 From: David Quigley Date: Wed, 13 Jul 2016 17:17:41 -0400 Subject: [PATCH 04/11] DLPX-44733 combine arc_buf_alloc_impl() with arc_buf_clone() --- include/sys/arc.h | 2 +- module/zfs/arc.c | 535 ++++++++++++++++++++++++++-------------------- module/zfs/dmu.c | 2 +- 3 files changed, 306 insertions(+), 233 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 97529c3fc3ae..e1422d2e1051 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -152,7 +152,7 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_buf_flags_t b_prop_flags; + arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 959d3fe88034..7d8b0dca3bc4 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -851,8 +851,8 @@ static taskq_t *arc_prune_taskq; HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) -#define ARC_BUF_SHARED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_SHARED) -#define ARC_BUF_COMPRESSED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_COMPRESSED) +#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes @@ -1322,6 +1322,12 @@ arc_buf_is_shared(arc_buf_t *buf) IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); + + /* + * It would be nice to assert arc_can_share() too, but the "hdr isn't + * already being shared" requirement prevents us from doing that. + */ + return (shared); } @@ -1337,6 +1343,11 @@ arc_cksum_free(arc_buf_hdr_t *hdr) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } +/* + * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data + * matches the checksum that is stored in the hdr. 
If there is no checksum, + * or if the buf is compressed, this is a no-op. + */ static void arc_cksum_verify(arc_buf_t *buf) { @@ -1346,6 +1357,12 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + hdr->b_l1hdr.b_bufcnt > 1); + return; + } + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); @@ -1430,6 +1447,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) return (valid_cksum); } +/* + * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a + * checksum and attaches it to the buf's hdr so that we can ensure that the buf + * isn't modified later on. If buf is compressed or there is already a checksum + * on the hdr, this is a no-op (we only checksum uncompressed bufs). + */ static void arc_cksum_compute(arc_buf_t *buf) { @@ -1446,7 +1469,14 @@ arc_cksum_compute(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + /* + * Since the checksum doesn't apply to compressed buffers, we + * only keep a checksum if there are uncompressed buffers. + * Therefore there must be another buffer, which is + * uncompressed. + */ + IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL, + hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } @@ -1534,9 +1564,7 @@ arc_buf_thaw(arc_buf_t *buf) ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - if (zfs_flags & ZFS_DEBUG_MODIFY) { - arc_cksum_verify(buf); - } + arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum or @@ -1615,8 +1643,6 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the - * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 
- * Holes and embedded blocks remain anonymous so we don't * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { @@ -1632,65 +1658,158 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) } } -int -arc_decompress(arc_buf_t *buf) +/* + * Looks for another buf on the same hdr which has the data decompressed, copies + * from it, and returns true. If no such buf exists, returns false. + */ +static boolean_t +arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; + arc_buf_t *from; + boolean_t copied = B_FALSE; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(!ARC_BUF_COMPRESSED(buf)); + + for (from = hdr->b_l1hdr.b_buf; from != NULL; + from = from->b_next) { + /* can't use our own data buffer */ + if (from == buf) { + continue; + } + + if (!ARC_BUF_COMPRESSED(from)) { + bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); + copied = B_TRUE; + break; + } + } + + /* + * There were no decompressed bufs, so there should not be a + * checksum on the hdr either. + */ + EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); + + return (copied); +} + +/* + * Given a buf that has a data buffer attached to it, this function will + * efficiently fill the buf with data of the specified compression setting from + * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr + * are already sharing a data buf, no copy is performed. + * + * If the buf is marked as compressed but uncompressed data was requested, this + * will allocate a new data buffer for the buf, remove that flag, and fill the + * buf with uncompressed data. You can't request a compressed buf on a hdr with + * uncompressed data, and (since we haven't added support for it yet) if you + * want compressed data your buf must already be marked as compressed and have + * the correct-sized data buffer. 
+ */ +static int +arc_buf_fill(arc_buf_t *buf, boolean_t compressed) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; - int error; - if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { - /* - * The arc_buf_hdr_t is either not compressed or is - * associated with an embedded block or a hole in which - * case they remain anonymous. - */ - IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || - HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT3P(buf->b_data, !=, NULL); + IMPLY(compressed, hdr_compressed); + IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); + + if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, - HDR_GET_LSIZE(hdr)); + arc_buf_size(buf)); } } else { + ASSERT(hdr_compressed); + ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* - * If the buf is compressed and sharing data with hdr, unlink - * its data buf from the header and make it uncompressed. + * If the buf is sharing its data with the hdr, unlink it and + * allocate a new data buffer for the buf. */ - if (ARC_BUF_COMPRESSED(buf)) { - buf->b_prop_flags &= - ~(ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED); + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + + /* We need to give the buf its own b_data */ + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - /* - * Previously this buf was shared so overhead was 0, so - * just add new overhead. 
- */ + /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + } else if (ARC_BUF_COMPRESSED(buf)) { + /* We need to reallocate the buf's b_data */ + arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), + buf); + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + + /* We increased the size of b_data; update overhead */ + ARCSTAT_INCR(arcstat_overhead_size, + HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } - error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); - if (error != 0) { - zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); - return (SET_ERROR(EIO)); + /* + * Regardless of the buf's previous compression settings, it + * should not be compressed at the end of this function. + */ + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + + /* + * Try copying the data from another buf which already has a + * decompressed version. If that's not possible, it's time to + * bite the bullet and decompress the data from the hdr. + */ + if (arc_buf_try_copy_decompressed_data(buf)) { + /* Skip byteswapping and checksumming (already done) */ + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); + return (0); + } else { + int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + + /* + * Absent hardware errors or software bugs, this should + * be impossible, but log it anyway so we can debug it. 
+ */ + if (error != 0) { + zfs_dbgmsg( + "hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } } } + + /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } + + /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); + return (0); } +int +arc_decompress(arc_buf_t *buf) +{ + return (arc_buf_fill(buf, B_FALSE)); +} + /* * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. */ @@ -1739,7 +1858,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; - ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); (void) refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } @@ -1775,7 +1893,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; - ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); (void) refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } @@ -2010,8 +2127,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (arc_buf_is_shared(buf)) continue; - ASSERT3U(HDR_GET_LSIZE(hdr), ==, - arc_buf_size(buf)); (void) refcount_add_many(&new_state->arcs_size, arc_buf_size(buf), buf); } @@ -2066,8 +2181,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (arc_buf_is_shared(buf)) continue; - ASSERT3U(HDR_GET_LSIZE(hdr), ==, - arc_buf_size(buf)); (void) refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); @@ -2170,18 +2283,61 @@ arc_space_return(uint64_t space, arc_space_type_t type) } /* - * Allocate either the first buffer for this hdr, or a compressed buffer for 
- * this hdr. Subsequent non-compressed buffers use arc_buf_clone(). + * Given a hdr and a buf, returns whether that buf can share its b_data buffer + * with the hdr's b_pdata. */ -static arc_buf_t * -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed) +static boolean_t +arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + boolean_t hdr_compressed, buf_compressed; + /* + * The criteria for sharing a hdr's data are: + * 1. the hdr's compression matches the buf's compression + * 2. the hdr doesn't need to be byteswapped + * 3. the hdr isn't already being shared + * 4. the buf is either compressed or it is the last buf in the hdr list + * + * Criterion #4 maintains the invariant that shared uncompressed + * bufs must be the final buf in the hdr's b_buf list. Reading this, you + * might ask, "if a compressed buf is allocated first, won't that be the + * last thing in the list?", but in that case it's impossible to create + * a shared uncompressed buf anyway (because the hdr must be compressed + * to have the compressed buf). You might also think that #3 is + * sufficient to make this guarantee, however it's possible + * (specifically in the rare L2ARC write race mentioned in + * arc_buf_alloc_impl()) there will be an existing uncompressed buf that + * is sharable, but wasn't at the time of its allocation. Rather than + * allow a new shared uncompressed buf to be created and then shuffle + * the list around to make it the last element, this simply disallows + * sharing if the new buf isn't the first to be added. + */ + ASSERT3P(buf->b_hdr, ==, hdr); + hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; + buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; + return (buf_compressed == hdr_compressed && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + !HDR_SHARED_DATA(hdr) && + (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); +} + +/* + * Allocate a buf for this hdr. 
If you care about the data that's in the hdr, + * or if you want a compressed buffer, pass those flags in. Returns 0 if the + * copy was made successfully, or an error code otherwise. + */ +static int +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, + boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; + boolean_t can_share; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); + ASSERT3P(ret, !=, NULL); + ASSERT3P(*ret, ==, NULL); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; @@ -2189,10 +2345,11 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed) hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; + buf->b_flags = 0; add_reference(hdr, tag); @@ -2203,67 +2360,57 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed) ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * If the hdr's data can be shared (no byteswapping, hdr compression - * matches the requested buf compression) then we share the data buffer - * and set the appropriate bit in the hdr's b_flags to indicate - * the hdr is sharing it's b_pdata with the arc_buf_t. Otherwise, we + * Only honor requests for compressed bufs if the hdr is actually + * compressed. + */ + if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + + /* + * Although the ARC should handle it correctly, levels above the ARC + * should prevent us from having multiple compressed bufs off the same + * hdr. To ensure we notice it if this behavior changes, we assert this + * here the best we can. 
+ */ + IMPLY(ARC_BUF_COMPRESSED(buf), !HDR_SHARED_DATA(hdr)); + + /* + * If the hdr's data can be shared then we share the data buffer and + * set the appropriate bit in the hdr's b_flags to indicate the hdr is * allocate a new buffer to store the buf's data. + * + * There is one additional restriction here because we're sharing + * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively + * involved in an L2ARC write, because if this buf is used by an + * arc_write() then the hdr's data buffer will be released when the + * write completes, even though the L2ARC write might still be using it. */ - if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && compressed && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT(!HDR_SHARED_DATA(hdr)); - buf->b_data = hdr->b_l1hdr.b_pdata; - buf->b_prop_flags = - ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED; - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - } else if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - !compressed && HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(ARC_BUF_LAST(buf)); + can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + + /* Set up b_data and sharing */ + if (can_share) { buf->b_data = hdr->b_l1hdr.b_pdata; - buf->b_prop_flags = ARC_BUF_FLAG_SHARED; + buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - ASSERT(!compressed); - buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - buf->b_prop_flags = 0; - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_data = + arc_get_data_buf(hdr, arc_buf_size(buf), buf); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; - return (buf); -} - -/* - * Used when allocating additional buffers. 
- */ -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = HDR_GET_LSIZE(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!ARC_BUF_COMPRESSED(from)); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_prop_flags = 0; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - bcopy(from->b_data, buf->b_data, size); - hdr->b_l1hdr.b_bufcnt += 1; + /* + * If the user wants the data from the hdr, we need to either copy or + * decompress the data. + */ + if (fill) { + return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); + } - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - return (buf); + return (0); } static char *arc_onloan_tag = "onloan"; @@ -2367,8 +2514,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); + ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); @@ -2380,7 +2526,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pdata = buf->b_data; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - buf->b_prop_flags |= ARC_BUF_FLAG_SHARED; + buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need @@ -2395,7 +2541,6 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - ASSERT(HDR_SHARED_DATA(hdr)); ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); @@ -2407,7 +2552,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 
refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); hdr->b_l1hdr.b_pdata = NULL; - buf->b_prop_flags &= ~ARC_BUF_FLAG_SHARED; + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between @@ -2470,10 +2615,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_buf_hdr_t *hdr = buf->b_hdr; /* - * Free up the data associated with the buf but only - * if we're not sharing this with the hdr. If we are sharing - * it with the hdr, then hdr will have performed the allocation - * so allow it to do the free. + * Free up the data associated with the buf but only if we're not + * sharing this with the hdr. If we are sharing it with the hdr, the + * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* @@ -2482,9 +2626,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - if (!ARC_BUF_COMPRESSED(buf)) { - arc_cksum_verify(buf); - } + arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { @@ -2502,27 +2644,23 @@ arc_buf_destroy_impl(arc_buf_t *buf) lastbuf = arc_buf_remove(hdr, buf); - if (ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* - * For compressed, shared buffers we don't need to do anything - * special so take the opportunity to ensure that compressed - * buffers must be shared. The hdr has already been marked as - * not shared and we already cleared b_data, so just check the - * flag on the buf. - */ - VERIFY(ARC_BUF_SHARED(buf)); - } else if (ARC_BUF_SHARED(buf)) { - ASSERT(!ARC_BUF_COMPRESSED(buf)); - - /* - * If the current arc_buf_t is sharing its data - * buffer with the hdr, then reassign the hdr's - * b_pdata to share it with the new buffer at the end - * of the list. The shared buffer is always the last one - * on the hdr's buffer list. 
+ * If the current arc_buf_t is sharing its data buffer with the + * hdr, then reassign the hdr's b_pdata to share it with the new + * buffer at the end of the list. The shared buffer is always + * the last one on the hdr's buffer list. + * + * There is an equivalent case for compressed bufs, but since + * they aren't guaranteed to be the last buf in the list and + * that is an exceedingly rare case, we just allow that space be + * wasted temporarily. */ if (lastbuf != NULL) { + /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); + /* hdr is uncompressed so can't have compressed buf */ + VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); arc_hdr_free_pdata(hdr); @@ -2744,7 +2882,8 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - buf = arc_buf_alloc_impl(hdr, tag, B_FALSE); + buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); @@ -2769,7 +2908,8 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, compression_type, ARC_BUFC_DATA); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - buf = arc_buf_alloc_impl(hdr, tag, B_TRUE); + buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); @@ -4641,9 +4781,10 @@ arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; - arc_callback_t *callback_list, *acb; + arc_callback_t *callback_list; + arc_callback_t *acb; boolean_t freeable = B_FALSE; - arc_buf_t *decomp_buf = NULL; + boolean_t no_zio_error = (zio->io_error == 0); int callback_cnt = 0; /* * The hdr was inserted into hash-table and removed from lists @@ -4670,7 +4811,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (zio->io_error == 0) { + if (no_zio_error) { /* byteswap if necessary */ if 
(BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -4691,8 +4832,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -4702,40 +4842,25 @@ arc_read_done(zio_t *zio) arc_access(hdr, hash_lock); } - /* create buffers for the callers. only decompress the data once. */ + /* + * If a read request has a callback (i.e. acb_done is not NULL), then we + * make a buf containing the data according to the parameters which were + * passed in. The implementation of arc_buf_alloc_impl() ensures that we + * aren't needlessly decompressing the data multiple times. + */ for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + int error; if (!acb->acb_done) continue; - /* - * If we're here, then this must be a demand read - * since prefetch requests don't have callbacks. - * If a read request has a callback (i.e. acb_done is - * not NULL), then we decompress the data for the - * first request and clone the rest. This avoids - * having to waste cpu resources decompressing data - * that nobody is explicitly waiting to read. 
- */ + /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; - if (acb->acb_compressed && !HDR_SHARED_DATA(hdr) && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { - acb->acb_buf = arc_buf_alloc_impl(hdr, - acb->acb_private, B_TRUE); - } else { - if (decomp_buf == NULL) { - decomp_buf = arc_buf_alloc_impl(hdr, - acb->acb_private, B_FALSE); - if (zio->io_error == 0) { - zio->io_error = - arc_decompress(decomp_buf); - } - acb->acb_buf = decomp_buf; - } else { - add_reference(hdr, acb->acb_private); - acb->acb_buf = arc_buf_clone(decomp_buf); - } + + error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, no_zio_error, &acb->acb_buf); + if (no_zio_error) { + zio->io_error = error; } } hdr->b_l1hdr.b_acb = NULL; @@ -4749,7 +4874,7 @@ arc_read_done(zio_t *zio) ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error == 0) { + if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -4925,47 +5050,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); - /* - * If we're doing a raw read, the header hasn't been - * shared yet, the header contains compressed data, and - * the data does not need to be byteswapped, use the - * header's b_pdata as the new buf's b_data. Otherwise, - * we'll either need to clone an existing decompressed - * buf or decompress the data ourselves. 
- */ - if (compressed_read && !HDR_SHARED_DATA(hdr) && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { - buf = arc_buf_alloc_impl(hdr, private, B_TRUE); - } else { - /* search for a decompressed buf */ - for (buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (!ARC_BUF_COMPRESSED(buf)) - break; - } - - if (buf == NULL) { - /* there could be one compressed buf */ - IMPLY(HDR_SHARED_DATA(hdr), - refcount_count( - &hdr->b_l1hdr.b_refcnt) == 1); - /* otherwise there won't be any */ - IMPLY(!HDR_SHARED_DATA(hdr), - refcount_count( - &hdr->b_l1hdr.b_refcnt) == 0); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, - ==, NULL); - buf = arc_buf_alloc_impl(hdr, private, - B_FALSE); - VERIFY0(arc_decompress(buf)); - } else { - add_reference(hdr, private); - buf = arc_buf_clone(buf); - } - } - ASSERT3P(buf->b_data, !=, NULL); - + /* Get a buf with the desired data in it. */ + VERIFY0(arc_buf_alloc_impl(hdr, private, + compressed_read, B_TRUE, &buf)); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); @@ -5396,8 +5483,10 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (arc_buf_is_shared(buf)) + if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } /* * Pull the data off of this hdr and attach it to @@ -5409,9 +5498,7 @@ arc_release(arc_buf_t *buf, void *tag) /* * If the current arc_buf_t and the hdr are sharing their data - * buffer, then we must stop sharing that block, transfer - * ownership and setup sharing with a new arc_buf_t at the end - * of the hdr's b_buf list. + * buffer, then we must stop sharing that block. 
*/ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -5426,19 +5513,16 @@ arc_release(arc_buf_t *buf, void *tag) arc_unshare_buf(hdr, buf); /* - * If the buf we removed was compressed, then - * we need to allocate a new compressed block for the - * hdr and copy the data over. Otherwise, the - * buffer was uncompressed and we can now share - * the data with the lastbuf. + * Now we need to recreate the hdr's b_pdata. Since we + * have lastbuf handy, we try to share with it, but if + * we can't then we allocate a new b_pdata and copy the + * data from buf into it. */ - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); + if (arc_can_share(hdr, lastbuf)) { + arc_share_buf(hdr, lastbuf); + } else { arc_hdr_alloc_pdata(hdr); bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); - } else { - ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); - arc_share_buf(hdr, lastbuf); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { @@ -5601,32 +5685,21 @@ arc_write_ready(zio_t *zio) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { - ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); arc_hdr_alloc_pdata(hdr); bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); } else { ASSERT3P(buf->b_data, ==, zio->io_orig_data); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(zio->io_orig_size, ==, HDR_GET_PSIZE(hdr)); - } else { - ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); - } - EQUIV(HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF, - ARC_BUF_COMPRESSED(buf)); /* * This hdr is not compressed so we're able to share * the arc_buf_t data buffer with the hdr. 
*/ arc_share_buf(hdr, buf); - VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, HDR_GET_LSIZE(hdr))); } arc_hdr_verify(hdr, zio->io_bp); @@ -5685,7 +5758,7 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *exists; kmutex_t *hash_lock; - ASSERT(zio->io_error == 0); + ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 9bbc5f3a66ed..21b5534df790 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1374,7 +1374,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); - ASSERT(!(buf->b_prop_flags & ARC_BUF_FLAG_COMPRESSED)); + ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); From 638d8c9cf3ef8f2ef27363c7460421e9d6ffeba2 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 22 Jul 2016 11:52:49 -0400 Subject: [PATCH 05/11] DLPX-44812 integrate EP-220 large memory scalability --- cmd/raidz_test/raidz_bench.c | 3 + cmd/raidz_test/raidz_test.c | 11 + cmd/zdb/zdb.c | 48 +- cmd/zdb/zdb_il.c | 59 +- cmd/ztest/ztest.c | 18 +- include/sys/Makefile.am | 1 + include/sys/abd.h | 160 +++++ include/sys/arc_impl.h | 2 +- include/sys/ddt.h | 5 +- include/sys/spa.h | 11 +- include/sys/vdev_impl.h | 3 +- include/sys/vdev_raidz_impl.h | 5 +- include/sys/zio.h | 30 +- include/sys/zio_checksum.h | 20 +- include/sys/zio_compress.h | 28 +- include/zfs_fletcher.h | 3 + lib/libzpool/Makefile.am | 1 + module/zcommon/zfs_fletcher.c | 54 +- module/zfs/Makefile.in | 1 + module/zfs/abd.c | 986 ++++++++++++++++++++++++++++ module/zfs/arc.c | 381 ++++++----- module/zfs/blkptr.c | 2 +- module/zfs/dbuf.c | 7 +- module/zfs/ddt.c | 12 +- module/zfs/dmu.c | 14 +- module/zfs/dmu_send.c | 14 +- module/zfs/dsl_scan.c | 12 +- module/zfs/sha256.c | 46 ++ module/zfs/spa.c | 8 +- module/zfs/vdev.c | 11 +- 
module/zfs/vdev_cache.c | 37 +- module/zfs/vdev_disk.c | 43 +- module/zfs/vdev_file.c | 17 +- module/zfs/vdev_label.c | 76 ++- module/zfs/vdev_mirror.c | 14 +- module/zfs/vdev_queue.c | 24 +- module/zfs/vdev_raidz.c | 599 +++++++++++------ module/zfs/vdev_raidz_math.c | 28 +- module/zfs/vdev_raidz_math_avx2.c | 6 +- module/zfs/vdev_raidz_math_impl.h | 9 +- module/zfs/vdev_raidz_math_scalar.c | 1 - module/zfs/vdev_raidz_math_ssse3.c | 5 + module/zfs/zil.c | 6 +- module/zfs/zio.c | 269 +++++--- module/zfs/zio_checksum.c | 144 +++- module/zfs/zio_compress.c | 80 ++- 46 files changed, 2596 insertions(+), 718 deletions(-) create mode 100644 include/sys/abd.h create mode 100644 module/zfs/abd.c diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index f1710ccc7c4b..7ddb7bed0182 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -23,6 +23,8 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ +#ifdef _ABD_READY_ + #include #include #include @@ -225,3 +227,4 @@ run_raidz_benchmark(void) bench_fini_raidz_maps(); } +#endif diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index e592cf7e8107..916d796db09d 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -32,6 +32,16 @@ #include #include #include + +#ifndef _ABD_READY_ +int +main(int argc, char **argv) +{ + exit(0); +} + +#else + #include "raidz_test.h" static int *rand_data; @@ -768,3 +778,4 @@ main(int argc, char **argv) return (err); } +#endif diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 611e9229461c..805580008f60 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -2462,7 +2463,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2528,7 +2529,7 @@ 
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -2541,7 +2542,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(NULL, spa, bp, data, size, + zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } @@ -3319,6 +3320,13 @@ zdb_vdev_lookup(vdev_t *vdev, char *path) return (NULL); } +/* ARGSUSED */ +static int +random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) +{ + return (random_get_pseudo_bytes(buf, len)); +} + /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: @@ -3350,7 +3358,8 @@ zdb_read_block(char *thing, spa_t *spa) uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *pbuf, *lbuf, *buf; + abd_t *pabd; + void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3423,8 +3432,7 @@ zdb_read_block(char *thing, spa_t *spa) psize = size; lsize = size; - /* Some 4K native devices require 4K buffer alignment */ - pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL); + pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3452,15 +3460,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. 
*/ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3483,13 +3491,13 @@ zdb_read_block(char *thing, spa_t *spa) void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - bcopy(pbuf, pbuf2, psize); + abd_copy_to_buf(pbuf2, pabd, psize); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, + random_get_pseudo_bytes_cb, NULL)); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize)); /* * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, @@ -3504,10 +3512,10 @@ zdb_read_block(char *thing, spa_t *spa) "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[c].ci_name); - if (zio_decompress_data(c, pbuf, lbuf, - psize, lsize) == 0 && - zio_decompress_data(c, pbuf2, lbuf2, - psize, lsize) == 0 && + if (zio_decompress_data(c, pabd, + lbuf, psize, lsize) == 0 && + zio_decompress_data_buf(c, pbuf2, + lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } @@ -3525,7 +3533,7 @@ zdb_read_block(char *thing, spa_t *spa) buf = lbuf; size = lsize; } else { - buf = pbuf; + buf = abd_to_buf(pabd); size = psize; } @@ -3543,7 +3551,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: - umem_free(pbuf, SPA_MAXBLOCKSIZE); + abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 1501e879d207..190bfee86ae5 100644 --- a/cmd/zdb/zdb_il.c +++ 
b/cmd/zdb/zdb_il.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -42,6 +42,7 @@ #include #include #include +#include extern uint8_t dump_opt[256]; @@ -119,14 +120,30 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm); } +/* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + int i; + + for (i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + /* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { - char *data, *dlimit; + abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; - char *buf; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -137,9 +154,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (txtype == TX_WRITE2 || verbose < 5) return; - if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL) - return; - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, !BP_IS_HOLE(bp) && @@ -150,43 +164,38 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - bzero(buf, SPA_MAXBLOCKSIZE); (void) printf("%s\n", prefix); - goto exit; + return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s\n", prefix); - goto exit; + return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) - goto exit; - data = buf; + goto 
out; } else { - data = (char *)(lr + 1); + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); } - dlimit = data + MIN(lr->lr_length, - (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); - (void) printf("%s", prefix); - while (data < dlimit) { - if (isprint(*data)) - (void) printf("%c ", *data); - else - (void) printf("%2hhX", *data); - data++; - } + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); (void) printf("\n"); -exit: - free(buf); + +out: + abd_free(data); } /* ARGSUSED */ diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 1b77b6ceed97..a68dbcb61ff2 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -114,6 +114,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern int zfs_abd_scatter_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5442,7 +5444,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5523,14 +5525,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. 
*/ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); - ztest_pattern_set(buf, psize, ~pattern); + abd = abd_alloc_linear(psize, B_TRUE); + ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(abd); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); @@ -5887,6 +5889,12 @@ ztest_resume_thread(void *arg) */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 40cd0597c9b9..d88e5cdca87b 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = fm fs crypto sysevent COMMON_H = \ + $(top_srcdir)/include/sys/abd.h \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h new file mode 100644 index 000000000000..ea3f7e2fb222 --- /dev/null +++ b/include/sys/abd.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */ +} abd_flags_t; + +typedef struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; + uint_t abd_chunk_size; + void *abd_chunks[]; + } abd_scatter; + struct abd_linear { + void *abd_buf; + } abd_linear; + } abd_u; +} abd_t; + +typedef int abd_iter_func_t(void *buf, size_t len, void *private); +typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); + +extern int zfs_abd_scatter_enabled; + +static inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0); +} + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_for_io_nosleep(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_free(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void 
abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); + +/* + * Wrappers for calls with offsets of 0 + */ + +static inline void +abd_copy(abd_t *dabd, abd_t *sabd, size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +static inline void +abd_copy_from_buf(abd_t *abd, void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +static inline void +abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + abd_copy_to_buf_off(buf, abd, 0, size); +} + +static inline int +abd_cmp_buf(abd_t *abd, void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +static inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * Module lifecycle + */ + +void abd_init(void); +void abd_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index d2dc527feb4e..f5b7cb42a6fb 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -166,7 +166,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - void *b_pdata; + abd_t *b_pabd; } l1arc_buf_hdr_t; typedef struct l2arc_dev { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 3befcb84427c..667795f967f7 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -20,6 +20,7 @@ */ /* * Copyright 
(c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DDT_H @@ -35,6 +36,8 @@ extern "C" { #endif +struct abd; + /* * On-disk DDT formats, in the desired search order (newest version first). */ @@ -108,7 +111,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + struct abd *dde_repair_abd; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff --git a/include/sys/spa.h b/include/sys/spa.h index 369ee8ef9329..a5b82c44d19f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -407,15 +407,17 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) -#define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ @@ -560,8 +562,7 @@ _NOTE(CONSTCOND) } while (0) } #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA) + (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 0d09c81c7f83..14e5bb7facf0 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -52,6 +52,7 @@ extern "C" { typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +struct abd; /* * Virtual device operations @@ -83,7 +84,7 @@ typedef const struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + struct abd *ve_abd; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index af09aa643897..92e48e227f28 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -104,7 +105,7 @@ typedef struct raidz_col { size_t rc_devidx; /* child device index for I/O */ size_t rc_offset; /* device offset */ size_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_abd; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ unsigned int rc_tried; /* Did we attempt this I/O column? 
*/ @@ -121,7 +122,7 @@ typedef struct raidz_map { size_t rm_firstdatacol; /* First data column/parity count */ size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ size_t rm_reports; /* # of referencing checksum reports */ unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_ecksuminjected; /* checksum error was injected */ diff --git a/include/sys/zio.h b/include/sys/zio.h index e12844f0c849..bd24ee8fa324 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -80,6 +80,7 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_FUNCTIONS }; @@ -295,6 +296,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; +struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -327,12 +329,12 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, struct abd *data, uint64_t offset); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -391,8 +393,8 @@ struct zio { uint64_t io_lsize; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + struct abd *io_abd; + struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; @@ -445,19 +447,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t 
*zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t lsize, zio_done_func_t *done, void *private, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + struct abd *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -473,12 +475,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -509,19 +511,19 @@ extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void *zio_buf_alloc_flags(size_t size, int flags); -extern void 
zio_push_transform(zio_t *zio, void *data, uint64_t size, +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 04573ba5456f..dda56df2096e 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_CHECKSUM_H @@ -32,16 +32,18 @@ extern "C" { #endif +struct abd; + /* * Signature for checksum functions. */ -typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *); +typedef void zio_checksum_t(struct abd *, uint64_t, zio_cksum_t *); /* * Information about each checksum function. */ typedef const struct zio_checksum_info { - zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ + zio_checksum_t *ci_func[2]; /* checksum function per byteorder */ int ci_correctable; /* number of correctable bits */ int ci_eck; /* uses zio embedded checksum? */ boolean_t ci_dedup; /* strong enough for dedup? 
*/ @@ -62,14 +64,18 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_func_t zio_checksum_SHA256; +extern void zio_checksum_SHA256(const void *, uint64_t, zio_cksum_t *); + +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); +extern void zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index da58ef7aa5ec..1642823d3d42 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -22,12 +22,14 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H +#include + #ifdef __cplusplus extern "C" { #endif @@ -59,14 +61,21 @@ typedef size_t zio_compress_func_t(void *src, void *dst, typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); +/* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. 
+ */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); /* * Information about each compression function. */ typedef const struct zio_compress_info { - zio_compress_func_t *ci_compress; /* compression function */ - zio_decompress_func_t *ci_decompress; /* decompression function */ - int ci_level; /* level parameter */ - char *ci_name; /* algorithm name */ + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; } zio_compress_info_t; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; @@ -96,13 +105,16 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); - +extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len); -extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len); +extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len); #ifdef __cplusplus diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index f0cfbd57342a..3e472b1dad66 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -45,8 +45,11 @@ extern "C" { * checksum method is added. This method will ignore last (size % 4) bytes of * the data buffer. 
*/ +void fletcher_init(zio_cksum_t *); void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_2_incremental_native(void *, uint64_t, zio_cksum_t *); +void fletcher_2_incremental_byteswap(void *, uint64_t, zio_cksum_t *); void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index f56381d5871b..5123d4d2f632 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -30,6 +30,7 @@ KERNEL_C = \ zfs_uio.c \ zpool_prop.c \ zprop_common.c \ + abd.c \ arc.c \ blkptr.c \ bplist.c \ diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 5436bae9a8b9..7684497767ee 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -24,6 +24,10 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + /* * Fletcher Checksums * ------------------ @@ -207,13 +211,24 @@ static struct fletcher_4_kstat { static boolean_t fletcher_4_initialized = B_FALSE; void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_init(zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +void +fletcher_2_incremental_native(void *buf, uint64_t size, zio_cksum_t *zcp) { const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -223,14 +238,27 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); } +/*ARGSUSED*/ void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_native((void *) buf, size, zcp); +} + +void +fletcher_2_incremental_byteswap(void *buf, uint64_t size, zio_cksum_t *zcp) { const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -240,6 +268,14 @@ fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); } +/*ARGSUSED*/ +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); +} + static void fletcher_4_scalar_init(zio_cksum_t *zcp) { @@ -526,6 +562,8 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) #define 
FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +typedef void fletcher_checksum_func_t(const void *, uint64_t, zio_cksum_t *); + static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -537,8 +575,9 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); - zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : - fletcher_4_byteswap; + + fletcher_checksum_func_t *fletcher_4_test = native ? + fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; @@ -691,6 +730,9 @@ module_param_call(zfs_fletcher_4_impl, fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +EXPORT_SYMBOL(fletcher_init); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 07c6d1c6ca1b..ed04848fbb2c 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -7,6 +7,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += abd.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000000..a0af84770e40 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,986 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... 
+ * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. 
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+#ifndef KMC_NOTOUCH
+#define KMC_NOTOUCH 0
+#endif
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+ */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +/* + * It is possible to make all future ABDs be linear by setting this to B_FALSE. + * Otherwise, ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear(). + */ +int zfs_abd_scatter_enabled = B_TRUE; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 1024; + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif + +static kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +static void * +abd_alloc_chunk(void) +{ + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + return (c); +} + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +void +abd_init(void) +{ + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif + + /* + * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH + * so that no allocator metadata is stored with the buffers. 
+ */ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + size_t n; + int i; + + ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, + zfs_abd_chunk_size); + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + ASSERT3P( + abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(size_t chunkcnt) +{ + size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, size); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + int i; + size_t n; + abd_t *abd; + + if (!zfs_abd_scatter_enabled) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + n = abd_chunkcnt_for_bytes(size); + abd = abd_alloc_struct(n); + + abd->abd_flags = ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + for (i = 0; i < n; i++) { + void *c = abd_alloc_chunk(); + ASSERT3P(c, !=, NULL); + abd->abd_u.abd_scatter.abd_chunks[i] = c; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + int i; + + for (i = 0; i < n; i++) { + abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. 
+ */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+	/* Bitwise AND, not OR: test whether the META flag bit is set */
+	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+	if (abd_is_linear(sabd)) {
+		return (abd_alloc_linear(size, is_metadata));
+	} else {
+		return (abd_alloc(size, is_metadata));
+	}
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ *
+ * LINUX ABD TODO - once vdev_disk.c has ABD page support change to vanilla
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+abd_t *
+abd_alloc_for_io_nosleep(size_t size, boolean_t is_metadata)
+{
+	/*
+	 * ABD TODO -- Need to refactor to allow for nosleep option
+	 */
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+	abd_t *abd;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	if (abd_is_linear(sabd)) {
+		abd = abd_alloc_struct(0);
+
+		/*
+		 * Even if this buf is filesystem metadata, we only track that
+		 * if we own the underlying data buffer, which is not true in
+		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_struct(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd->abd_u.abd_scatter.abd_offset = + new_offset % zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, + &sabd->abd_u.abd_scatter.abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + } + + abd->abd_size = sabd->abd_size - off; + abd->abd_parent = sabd; + refcount_create(&abd->abd_children); + (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. 
+ */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. 
+ */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). + */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +struct abd_iter { + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; /* position (relative to abd_offset) */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ +}; + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) % 
zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = zfs_abd_chunk_size - offset; + paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + while (size > 0) { + size_t len; + abd_iter_map(&aiter); + + len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) 
+ */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. 
+ */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd); + abd_iter_init(&saiter, sabd); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + size_t dlen, slen, len; + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + dlen = MIN(daiter.iter_mapsize, size); + slen = MIN(saiter.iter_mapsize, size); + len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. 
+ */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +#endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7d8b0dca3bc4..f7ca664e6b85 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -136,14 +136,14 @@ * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pdata) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * ability to store the physical data (b_pabd) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an + * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. 
Each @@ -182,7 +182,7 @@ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t - * | b_pdata +-+ |b_next +---->+-----------+ + * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | @@ -199,8 +199,8 @@ * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a - * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be @@ -224,7 +224,7 @@ * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL + * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ @@ -238,19 +238,19 @@ * | +------+ | * +---------------------------------+ * - * Writing to the ARC requires that the ARC first discard the hdr's b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pdata. 
Writes are always done into buffers which have + * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * - * When the L2ARC is in use, it will also take advantage of the b_pdata. The - * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the @@ -271,7 +271,9 @@ #include #include #include +#include #include +#include #ifdef _KERNEL #include #include @@ -315,7 +317,7 @@ int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 5; -/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ @@ -444,13 +446,13 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* - * Uncompressed size of the data stored in b_pdata. 
If compressed + * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ @@ -943,7 +945,7 @@ typedef struct l2arc_read_callback { typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; @@ -953,10 +955,14 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); -static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_pabd(arc_buf_hdr_t *); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); @@ -1318,7 +1324,9 @@ static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && - buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); @@ -1358,8 +1366,6 @@ arc_cksum_verify(arc_buf_t *buf) return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1406,7 +1412,8 @@ arc_cksum_is_equal(arc_buf_hdr_t 
*hdr, zio_t *zio) cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); lsize = HDR_GET_LSIZE(hdr); - csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* @@ -1441,7 +1448,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); @@ -1465,18 +1472,9 @@ arc_cksum_compute(arc_buf_t *buf) mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { - /* - * Since the checksum doesn't apply to compressed buffers, we - * only keep a checksum if there are uncompressed buffers. - * Therefore there must be another buffer, which is - * uncompressed. - */ - IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL, - hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } @@ -1571,8 +1569,6 @@ arc_buf_thaw(arc_buf_t *buf) * allocate b_thawed. 
*/ if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1591,8 +1587,6 @@ arc_buf_freeze(arc_buf_t *buf) return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1722,7 +1716,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { @@ -1774,7 +1768,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, + hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* @@ -1811,7 +1805,7 @@ arc_decompress(arc_buf_t *buf) } /* - * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
*/ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) @@ -1844,14 +1838,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -1879,14 +1873,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -2033,7 +2027,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2102,7 +2096,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -2132,7 +2126,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(bufcnt, ==, buffers); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != 
NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { @@ -2145,7 +2139,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, @@ -2186,7 +2180,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } @@ -2284,7 +2278,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pdata. + * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) @@ -2379,17 +2373,20 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, * set the appropriate bit in the hdr's b_flags to indicate the hdr is * allocate a new buffer to store the buf's data. * - * There is one additional restriction here because we're sharing - * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively - * involved in an L2ARC write, because if this buf is used by an - * arc_write() then the hdr's data buffer will be released when the + * There are two additional restrictions here because we're sharing + * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be + * actively involved in an L2ARC write, because if this buf is used by + * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. + * Second, the hdr's ABD must be linear so that the buf's user doesn't + * need to be ABD-aware. 
*/ - can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && + abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { - buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { @@ -2474,11 +2471,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) } static void -l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; + df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); @@ -2503,7 +2500,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) } (void) refcount_remove_many(&state->arcs_size, size, hdr); - l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* @@ -2515,7 +2512,7 @@ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2524,7 +2521,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * the refcount whenever an arc_buf_t is shared. 
*/ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pdata = buf->b_data; + hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); + abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, + HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; @@ -2542,7 +2541,7 @@ static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2551,7 +2550,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - hdr->b_l1hdr.b_pdata = NULL; + abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); + abd_put(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* @@ -2647,7 +2648,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pdata to share it with the new + * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. 
* @@ -2662,8 +2663,8 @@ arc_buf_destroy_impl(arc_buf_t *buf) /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + arc_hdr_free_pabd(hdr); /* * We must setup a new shared block between the @@ -2696,26 +2697,26 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then @@ -2727,10 +2728,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } - hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); @@ -2766,7 +2767,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. 
*/ - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2806,7 +2807,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -2824,11 +2825,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } @@ -2913,6 +2914,14 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to disk, it's + * easiest if we just set up sharing between the buf and the hdr. + */ + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + return (buf); } @@ -2981,9 +2990,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - if (hdr->b_l1hdr.b_pdata != NULL) { - arc_hdr_free_pdata(hdr); - } + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_pabd(hdr); } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -3050,7 +3058,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pdata field) during its write phase. + * (i.e. 
its b_pabd field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. @@ -3066,7 +3074,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); + ASSERT(hdr->b_l1hdr.b_pabd == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3131,9 +3139,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly - * in arc_free_data_buf(). + * in arc_free_data_impl(). */ - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -3231,7 +3239,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since - * arc_get_data_buf() doesn't check for overflow + * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the @@ -4110,13 +4118,13 @@ arc_kmem_reap_now(void) } /* - * Threads can block in arc_get_data_buf() waiting for this thread to evict + * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_buf() are sleeping while holding the hash lock for their + * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. 
This is to prevent the following deadlock: * - * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, @@ -4465,18 +4473,45 @@ arc_is_overflowing(void) return (arc_size >= arc_c + overflow); } +static abd_t * +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (abd_alloc(size, B_TRUE)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (abd_alloc(size, B_FALSE)); + } +} + +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (zio_buf_alloc(size)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (zio_data_buf_alloc(size)); + } +} + /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. 
*/ -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +static void +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - void *datap = NULL; - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -4518,11 +4553,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { - ASSERT(type == ARC_BUFC_DATA); - datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -4558,14 +4590,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } - return (datap); +} + +static void +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +{ + arc_free_data_impl(hdr, size, tag); + abd_free(abd); +} + +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_free_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf, size); + } } /* * Free the arc data buffer. 
*/ static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -4582,11 +4634,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - zio_buf_free(data, size); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(data, size); arc_space_return(size, ARC_SPACE_DATA); } } @@ -4868,7 +4918,7 @@ arc_read_done(zio_t *zio) if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || @@ -4965,7 +5015,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -5117,7 +5167,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5135,9 +5185,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * avoid hitting an assert in remove_reference(). 
*/ arc_access(hdr, hash_lock); - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* @@ -5241,7 +5291,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, - size, hdr->b_l1hdr.b_pdata, + size, hdr->b_l1hdr.b_pabd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | @@ -5281,7 +5331,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { @@ -5513,16 +5563,16 @@ arc_release(arc_buf_t *buf, void *tag) arc_unshare_buf(hdr, buf); /* - * Now we need to recreate the hdr's b_pdata. Since we + * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pdata and copy the + * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pdata(hdr); - bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + arc_hdr_alloc_pabd(hdr); + bcopy(buf->b_data, hdr->b_l1hdr.b_pabd, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { @@ -5538,7 +5588,7 @@ arc_release(arc_buf_t *buf, void *tag) HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, @@ -5557,7 +5607,7 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); /* - * Allocate a new hdr. 
The new hdr will contain a b_pdata + * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); @@ -5646,15 +5696,15 @@ arc_write_ready(zio_t *zio) if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } } } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5676,32 +5726,45 @@ arc_write_ready(zio_t *zio) arc_hdr_set_compress(hdr, compress); /* - * If the hdr is compressed, then copy the compressed - * zio contents into arc_buf_hdr_t. Otherwise, copy the original - * data buf into the hdr. Ideally, we would like to always copy the - * io_data into b_pdata but the user may have disabled compressed - * arc thus the on-disk block may or may not match what we maintain - * in the hdr's b_pdata field. + * Fill the hdr with data. If the hdr is compressed, the data we want + * is available from the zio, otherwise we can take it from the buf. + * + * We might be able to share the buf's data with the hdr here. However, + * doing so would cause the ARC to be full of linear ABDs if we write a + * lot of shareable data. As a compromise, we check whether scattered + * ABDs are allowed, and assume that if they are then the user wants + * the ARC to be primarily filled with them regardless of the data being + * written. Therefore, if they're allowed then we allocate one and copy + * the data into it; otherwise, we share the data directly if we can. 
*/ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - !ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - arc_hdr_alloc_pdata(hdr); - bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + arc_hdr_alloc_pabd(hdr); + + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, + ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } } else { - ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - /* - * This hdr is not compressed so we're able to share - * the arc_buf_t data buffer with the hdr. - */ arc_share_buf(hdr, buf); - ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, - HDR_GET_LSIZE(hdr))); } + arc_hdr_verify(hdr, zio->io_bp); } @@ -5806,6 +5869,7 @@ arc_write_done(zio_t *zio) ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); + abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -5842,10 +5906,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_buf = buf; /* - * The hdr's b_pdata is now stale, free it now. A new data block + * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). 
*/ - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. @@ -5855,15 +5919,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, + zio = zio_write(pio, spa, txg, bp, + abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, @@ -6725,13 +6790,8 @@ l2arc_do_free_on_write(void) for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_data, !=, NULL); - if (df->l2df_type == ARC_BUFC_METADATA) { - zio_buf_free(df->l2df_data, df->l2df_size); - } else { - ASSERT(df->l2df_type == ARC_BUFC_DATA); - zio_data_buf_free(df->l2df_data, df->l2df_size); - } + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6885,12 +6945,12 @@ l2arc_read_done(zio_t *zio) mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(zio->io_data, !=, NULL); + ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ @@ -6924,7 +6984,7 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -7144,7 +7204,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t asize, size; - void *to_write; + abd_t *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -7217,7 +7277,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); size = arc_hdr_size(hdr); @@ -7233,18 +7293,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * add it to the l2arc_free_on_write queue. 
*/ if (!HDR_SHARED_DATA(hdr)) { - to_write = hdr->b_l1hdr.b_pdata; + to_write = hdr->b_l1hdr.b_pabd; } else { - arc_buf_contents_t type = arc_buf_type(hdr); - if (type == ARC_BUFC_METADATA) { - to_write = zio_buf_alloc(size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - to_write = zio_data_buf_alloc(size); - } - - bcopy(hdr->b_l1hdr.b_pdata, to_write, size); - l2arc_free_data_on_write(to_write, size, type); + to_write = abd_alloc_for_io(size, + HDR_ISTYPE_METADATA(hdr)); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + l2arc_free_abd_on_write(to_write, size, + arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, size, to_write, diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index d56e19996d8d..99accfa0fe8a 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1a765022c891..5b1cede9e956 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -46,6 +46,7 @@ #include #include #include +#include struct dbuf_hold_impl_data { /* Function arguments */ @@ -3690,6 +3691,9 @@ dbuf_write_override_done(zio_t *zio) mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); } /* Issue I/O to commit a dirty buffer to disk. */ @@ -3782,7 +3786,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ - void *contents = (data != NULL) ? data->b_data : NULL; + abd_t *contents = (data != NULL) ? 
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 12c1b7300a21..2e760f2e03ea 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include @@ -36,6 +36,7 @@ #include #include #include +#include static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -705,9 +706,8 @@ ddt_free(ddt_entry_t *dde) for (p = 0; p < DDT_PHYS_TYPES; p++) ASSERT(dde->dde_lead_zio[p] == NULL); - if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + if (dde->dde_repair_abd != NULL) + abd_free(dde->dde_repair_abd); cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); @@ -992,7 +992,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else @@ -1030,7 +1030,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 21b5534df790..01e8cf3fb3cb 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -1500,6 +1501,7 @@ 
dmu_sync_late_arrival_done(zio_t *zio) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } @@ -1524,11 +1526,11 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), - zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, - zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, - NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL, zb)); + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } @@ -2044,6 +2046,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); @@ -2069,6 +2072,7 @@ dmu_fini(void) xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); + abd_fini(); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b14bcec4b6df..e77da4f7af09 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -166,7 +166,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type != DRR_BEGIN) { @@ -174,13 +174,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } - 
fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - fletcher_4_incremental_native(payload, payload_len, + (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); @@ -1778,11 +1778,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - fletcher_4_incremental_byteswap(drr_begin, + (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - fletcher_4_incremental_native(drr_begin, + (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); @@ -2466,9 +2466,9 @@ static void receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { - fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { - fletcher_4_incremental_native(buf, len, &ra->cksum); + (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 41b3ce79b04d..fd7a53bc93e2 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 
* Copyright 2016 Gary Mills */ @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -1820,7 +1821,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1904,7 +1905,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) @@ -1919,9 +1919,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(scan_delay); - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); + zio_nowait(zio_read(NULL, spa, bp, + abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, + NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c index cf9dd8fcba1a..25fa4b187617 100644 --- a/module/zfs/sha256.c +++ b/module/zfs/sha256.c @@ -22,10 +22,56 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ #include #include #include +#include + +#if 0 +/* ADB bringup -- waiting for PR-4760 */ +static int +sha_incremental(void *buf, size_t size, void *arg) +{ + SHA2_CTX *ctx = arg; + SHA2Update(ctx, buf, size); + return (0); +} +#endif + +/*ARGSUSED*/ +void +abd_checksum_SHA256(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ +#if 1 + /* ADB bringup -- waiting for PR-4760 */ + void *loaner = abd_borrow_buf_copy(abd, size); + zio_checksum_SHA256(loaner, size, zcp); + abd_return_buf(abd, loaner, size); +#else + SHA2_CTX ctx; + zio_cksum_t tmp; + + SHA2Init(SHA256, &ctx); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); + SHA2Final(&tmp, &ctx); + + /* + * A prior implementation of this function had a + * private SHA256 implementation always wrote things out in + * Big Endian and there wasn't a byteswap variant of it. + * To preserve on disk compatibility we need to force that + * behavior. + */ + zcp->zc_word[0] = BE_64(tmp.zc_word[0]); + zcp->zc_word[1] = BE_64(tmp.zc_word[1]); + zcp->zc_word[2] = BE_64(tmp.zc_word[2]); + zcp->zc_word[3] = BE_64(tmp.zc_word[3]); +#endif +} /* * SHA-256 checksum, as specified in FIPS 180-3, available at: diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3b63427c2c96..4af5543cfa23 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1955,6 +1955,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; spa_t *spa = zio->io_spa; + abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) @@ -1962,7 +1963,6 @@ spa_load_verify_done(zio_t *zio) else atomic_inc_64(&sle->sle_data_count); } - zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1985,7 +1985,6 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { zio_t *rio; size_t size; - void *data; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -1996,12 +1995,11 @@ spa_load_verify_cb(spa_t *spa, 
zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -2009,7 +2007,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(rio, spa, bp, data, size, + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 75f6e5ce11ae..17ad08c8481c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -43,6 +43,7 @@ #include #include #include +#include #include /* @@ -978,16 +979,16 @@ vdev_probe_done(zio_t *zio) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_data, + zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; @@ -1103,8 +1104,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index e802240c12bd..65bec7d935a6 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include @@ -31,6 +31,7 @@ #include #include #include +#include /* * Virtual device read-ahead caching. @@ -141,12 +142,12 @@ static void vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) { ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - ASSERT(ve->ve_data != NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_abd); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -176,14 +177,14 @@ vdev_cache_allocate(zio_t *zio) ve = avl_first(&vc->vc_lastused_tree); if (ve->ve_fill_io != NULL) return (NULL); - ASSERT(ve->ve_hits != 0); + ASSERT3U(ve->ve_hits, !=, 0); vdev_cache_evict(vc, ve); } ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -197,7 +198,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); @@ -206,7 +207,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + 
abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); } /* @@ -220,16 +221,16 @@ vdev_cache_fill(zio_t *fio) vdev_cache_entry_t *ve = fio->io_private; zio_t *pio; - ASSERT(fio->io_size == VCBS); + ASSERT3U(fio->io_size, ==, VCBS); /* * Add data to the cache. */ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == fio); - ASSERT(ve->ve_offset == fio->io_offset); - ASSERT(ve->ve_data == fio->io_data); + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); ve->ve_fill_io = NULL; @@ -259,7 +260,7 @@ vdev_cache_read(zio_t *zio) zio_t *fio; ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) return (B_FALSE); @@ -273,7 +274,7 @@ vdev_cache_read(zio_t *zio) if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) return (B_FALSE); - ASSERT(cache_phase + zio->io_size <= VCBS); + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); mutex_enter(&vc->vc_lock); @@ -312,7 +313,7 @@ vdev_cache_read(zio_t *zio) } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; @@ -340,7 +341,7 @@ vdev_cache_write(zio_t *zio) uint64_t max_offset = P2ROUNDUP(io_end, VCBS); avl_index_t where; - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); mutex_enter(&vc->vc_lock); @@ -357,8 +358,8 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_abd, zio->io_abd, start - io_start, + start - ve->ve_offset, end - start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c 
index e399e6630639..40c7ae648fea 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static void *zfs_vdev_holder = VDEV_HOLDER; */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ + void *dr_loanbuf; /* borrowed abd buffer */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ @@ -402,6 +404,7 @@ vdev_disk_dio_put(dio_request_t *dr) */ if (rc == 0) { zio_t *zio = dr->dr_zio; + void *loanbuf = dr->dr_loanbuf; int error = dr->dr_error; vdev_disk_dio_free(dr); @@ -411,6 +414,15 @@ vdev_disk_dio_put(dio_request_t *dr) ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); + /* ABD placeholder */ + if (loanbuf != NULL) { + if (zio->io_type == ZIO_TYPE_READ) { + abd_copy_from_buf(zio->io_abd, loanbuf, + zio->io_size); + } + zio_buf_free(loanbuf, zio->io_size); + } + zio_delay_interrupt(zio); } } @@ -544,7 +556,30 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. */ - bio_ptr = kbuf_ptr; + if (zio != NULL) { + abd_t *abd = zio->io_abd; + + /* + * ABD placeholder + * We can't use abd_borrow_buf routines here since our + * completion context is interrupt and abd refcounts + * take a mutex (in debug mode). 
+ */ + if (abd_is_linear(abd)) { + bio_ptr = abd_to_buf(abd); + dr->dr_loanbuf = NULL; + } else { + bio_ptr = zio_buf_alloc(zio->io_size); + dr->dr_loanbuf = bio_ptr; + if (zio->io_type != ZIO_TYPE_READ) + abd_copy_to_buf(bio_ptr, abd, zio->io_size); + + } + } else { + bio_ptr = kbuf_ptr; + dr->dr_loanbuf = NULL; + } + bio_offset = kbuf_offset; bio_size = kbuf_size; for (i = 0; i <= dr->dr_bio_count; i++) { @@ -559,6 +594,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * are needed we allocate a larger dio and warn the user. */ if (dr->dr_bio_count == i) { + if (dr->dr_loanbuf) + zio_buf_free(dr->dr_loanbuf, zio->io_size); vdev_disk_dio_free(dr); bio_count *= 2; goto retry; @@ -568,6 +605,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, dr->dr_bio[i] = bio_alloc(GFP_NOIO, MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); if (unlikely(dr->dr_bio[i] == NULL)) { + if (dr->dr_loanbuf) + zio_buf_free(dr->dr_loanbuf, zio->io_size); vdev_disk_dio_free(dr); return (ENOMEM); } @@ -717,7 +756,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, NULL, zio->io_size, zio->io_offset, rw, flags); if (error) { zio->io_error = error; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index bca4175a6ff4..63e716b53d34 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -31,6 +31,7 @@ #include #include #include +#include /* * Virtual device vector for files. @@ -150,11 +151,21 @@ vdev_file_io_strategy(void *arg) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 1925c67edd1b..1be501855ad2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -145,6 +145,7 @@ #include #include #include +#include #include /* @@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -583,6 +584,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -595,7 +597,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -603,7 +606,7 @@ 
vdev_label_read_config(vdev_t *vd, uint64_t txg) zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -642,7 +645,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); return (config); } @@ -778,8 +781,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - char *pad2; + abd_t *vp_abd; + abd_t *pad2; uberblock_t *ub; + abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -863,8 +868,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. @@ -924,7 +930,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -932,14 +938,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); - bzero(ub, VDEV_UBERBLOCK_RING); - *ub = spa->spa_uberblock; + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. 
*/ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); - bzero(pad2, VDEV_PAD_SIZE); + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -949,7 +956,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -962,7 +969,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -975,9 +982,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -1041,7 +1048,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -1062,7 +1069,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } static void @@ -1078,8 +1085,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), 
VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } @@ -1146,7 +1153,7 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { - uberblock_t *ubbuf; + abd_t *ub_abd; int c, l, n; for (c = 0; c < vd->vdev_children; c++) @@ -1160,17 +1167,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); - *ubbuf = *ub; + /* Copy the uberblock_t into the ABD */ + ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1247,6 +1255,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1265,15 +1274,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ 
-1281,7 +1291,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); nvlist_free(label); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index d3dbdca79a42..101030e61832 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -31,6 +31,7 @@ #include #include #include +#include #include /* @@ -271,13 +272,13 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_abd, zio->io_abd, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -432,7 +433,8 @@ vdev_mirror_io_start(zio_t *zio) mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + abd_alloc_sametype(zio->io_abd, + zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } @@ -457,7 +459,7 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); c++; @@ -542,7 +544,7 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; @@ -583,7 +585,7 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, 
+ zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 8522c2627fe6..060dbf0919f3 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * ZFS I/O Scheduler @@ -487,12 +488,12 @@ vdev_queue_agg_io_done(zio_t *aio) if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_abd, aio->io_abd, + 0, pio->io_offset - aio->io_offset, pio->io_size); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_abd); } /* @@ -514,7 +515,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - void *buf; + abd_t *abd; limit = MAX(MIN(zfs_vdev_aggregation_limit, spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0); @@ -617,12 +618,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, limit); - buf = zio_buf_alloc_flags(size, KM_NOSLEEP); - if (buf == NULL) + abd = abd_alloc_for_io_nosleep(size, B_TRUE); + if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - buf, size, first->io_type, zio->io_priority, + abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -635,12 +636,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_abd, + dio->io_offset - 
aio->io_offset, dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_abd, dio->io_abd, + dio->io_offset - aio->io_offset, 0, dio->io_size); } zio_add_child(dio, aio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index f5df2c7d8970..23c3d8832838 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -136,7 +137,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -144,11 +145,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_abd); size += rm->rm_col[c].rc_size; + } - if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -185,7 +188,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -199,8 +202,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -208,15 +212,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * replacing them with buffers to 
hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_abd = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -226,13 +235,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_data = bad_parity[x]; + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset( + rm->rm_abd_copy, offset); + offset += rm->rm_col[x].rc_size; } } @@ -246,8 +259,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); } /* @@ -260,7 +275,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -274,7 +289,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, 
void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_datacopy != NULL) + if (rm->rm_abd_copy != NULL) return; /* @@ -290,17 +305,20 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + rm->rm_abd_copy = + abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -329,6 +347,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, /* The starting byte offset on each child vdev. 
*/ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -373,7 +392,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - rm->rm_datacopy = NULL; + rm->rm_abd_copy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; @@ -389,7 +408,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_abd = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; @@ -412,13 +431,16 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, 0); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, off); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -464,29 +486,84 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, 
pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -494,50 +571,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + 
abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -546,59 +616,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. 
- */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -630,40 +689,159 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + int i; + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + 
int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + int j; + uint8_t *b; + + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); - src = rm->rm_col[VDEV_RAIDZ_P].rc_data; - dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + 
abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -672,57 +850,46 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + uint64_t size = (c == x) ? 
0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); - count = MIN(ccount, xcount); + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - dst = rm->rm_col[x].rc_data; + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; exp = 255 - (rm->rm_cols - 1 - x); + rq.q = abd_to_buf(src); + rq.exp = exp; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -730,11 +897,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -750,15 +919,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full 
of zeros by * setting their lengths to zero. */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); - rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -767,12 +936,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xd = rm->rm_col[x].rc_data; - yd = rm->rm_col[y].rc_data; + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; /* * We now have: @@ -796,24 +965,27 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + ASSERT3U(xsize, >=, ysize); + rpq.p = p; + rpq.q = q; + rpq.pxy = pxy; + rpq.qxy = qxy; + rpq.aexp = aexp; + rpq.bexp = bexp; - if (i < ysize) - *yd = *p ^ *pxy ^ *xd; - } + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, - 
rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1131,7 +1303,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = abd_to_buf(rm->rm_col[c].rc_abd); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1139,7 +1311,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1187,8 +1359,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. 
+ */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1275,6 +1464,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1321,7 +1524,6 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; - /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) @@ -1479,7 +1681,7 @@ vdev_raidz_io_start(zio_t *zio) rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1536,7 +1738,7 @@ vdev_raidz_io_start(zio_t *zio) if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1552,6 +1754,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = 
zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1565,9 +1768,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_abd, buf, rc->rc_size); } } @@ -1609,7 +1814,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1618,7 +1823,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1721,7 +1926,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); + abd_copy_to_buf(orig[i], rc->rc_abd, + rc->rc_size); } /* @@ -1751,7 +1957,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_abd, orig[i], + rc->rc_size); } do { @@ -1990,7 +2197,7 @@ vdev_raidz_io_done(zio_t *zio) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); @@ -2070,7 +2277,7 @@ 
vdev_raidz_io_done(zio_t *zio) continue; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index df9dc93867b9..f0a2d0d9ab01 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -44,6 +44,16 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = { .name = "fastest" }; +/* ABD BRINGUP -- not ready yet */ +#if 1 +#ifdef HAVE_SSSE3 +#undef HAVE_SSSE3 +#endif +#ifdef HAVE_AVX2 +#undef HAVE_AVX2 +#endif +#endif + /* All compiled in implementations */ const raidz_impl_ops_t *raidz_all_maths[] = { &vdev_raidz_original_impl, @@ -139,6 +149,8 @@ vdev_raidz_math_generate(raidz_map_t *rm) { raidz_gen_f gen_parity = NULL; +/* ABD Bringup -- vector code not ready */ +#if 0 switch (raidz_parity(rm)) { case 1: gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; @@ -155,6 +167,7 @@ vdev_raidz_math_generate(raidz_map_t *rm) raidz_parity(rm)); break; } +#endif /* if method is NULL execute the original implementation */ if (gen_parity == NULL) @@ -165,6 +178,8 @@ vdev_raidz_math_generate(raidz_map_t *rm) return (0); } +/* ABD Bringup -- vector code not ready */ +#if 0 static raidz_rec_f reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, const int nbaddata) @@ -219,6 +234,7 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, } return ((raidz_rec_f) NULL); } +#endif /* * Select data reconstruction method for raidz_map @@ -232,6 +248,8 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, { raidz_rec_f rec_data = NULL; +/* ABD Bringup -- vector code not ready */ +#if 0 switch (raidz_parity(rm)) { case PARITY_P: rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); @@ -247,6 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int 
*parity_valid, raidz_parity(rm)); break; } +#endif if (rec_data == NULL) return (RAIDZ_ORIGINAL_IMPL); @@ -461,13 +480,12 @@ vdev_raidz_math_init(void) return; #endif - /* Fake an zio and run the benchmark on it */ + /* Fake an zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ - bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE); - VERIFY(bench_zio->io_data); - memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { @@ -491,7 +509,7 @@ vdev_raidz_math_init(void) vdev_raidz_map_free(bench_rm); /* cleanup the bench zio */ - zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE); + abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); /* install kstats for all impl */ diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c index 9ca1688c1f49..4298f986f39e 100644 --- a/module/zfs/vdev_raidz_math_avx2.c +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -21,7 +21,6 @@ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
*/ - #include #if defined(__x86_64) && defined(HAVE_AVX2) @@ -381,7 +380,12 @@ DEFINE_REC_METHODS(avx2); static boolean_t raidz_will_avx2_work(void) { +/* ABD Bringup -- vector code not ready */ +#if 1 + return (B_FALSE); +#else return (zfs_avx_available() && zfs_avx2_available()); +#endif } const raidz_impl_ops_t vdev_raidz_avx2_impl = { diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 67cad76c1d32..5db44e47da36 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -33,7 +33,8 @@ #endif /* Calculate data offset in raidz column, offset is in bytes */ -#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_data) + (off))) +/* ADB BRINGUP -- needs to be refactored for ABD */ +#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_abd) + (off))) /* * PARITY CALCULATION @@ -83,6 +84,8 @@ raidz_generate_p_impl(raidz_map_t * const rm) const size_t psize = raidz_big_size(rm); const size_t short_size = raidz_short_size(rm); + panic("not ABD ready"); + raidz_math_begin(); /* short_size */ @@ -141,6 +144,8 @@ raidz_generate_pq_impl(raidz_map_t * const rm) const size_t psize = raidz_big_size(rm); const size_t short_size = raidz_short_size(rm); + panic("not ABD ready"); + raidz_math_begin(); /* short_size */ @@ -208,6 +213,8 @@ raidz_generate_pqr_impl(raidz_map_t * const rm) const size_t psize = raidz_big_size(rm); const size_t short_size = raidz_short_size(rm); + panic("not ABD ready"); + raidz_math_begin(); /* short_size */ diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c index 540cb9314fd4..d6d5fe60b113 100644 --- a/module/zfs/vdev_raidz_math_scalar.c +++ b/module/zfs/vdev_raidz_math_scalar.c @@ -24,7 +24,6 @@ */ #include - /* * Provide native CPU scalar routines. * Support 32bit and 64bit CPUs. 
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index 982e27efc487..12f406da2b76 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -383,8 +383,13 @@ DEFINE_REC_METHODS(ssse3); static boolean_t raidz_will_ssse3_work(void) { +/* ABD Bringup -- vector code not ready */ +#if 1 + return (B_FALSE); +#else return (zfs_sse_available() && zfs_sse2_available() && zfs_ssse3_available()); +#endif } const raidz_impl_ops_t vdev_raidz_ssse3_impl = { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index c18807be0ae6..8ac46e6a50ae 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -889,6 +890,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. */ + abd_put(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -925,12 +927,14 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 892b86fbaf43..19cb6d0de3c7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * ========================================================================== 
@@ -60,6 +61,11 @@ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#if defined(ZFS_DEBUG) && !defined(_KERNEL) +uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#endif + int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 @@ -202,6 +208,13 @@ zio_fini(void) */ if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; +#endif +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) + (void) printf("zio_fini: [%d] %llu != %llu\n", + (int)((c + 1) << SPA_MINBLOCKSHIFT), + (long long unsigned)zio_buf_cache_allocs[c], + (long long unsigned)zio_buf_cache_frees[c]); #endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; @@ -242,6 +255,9 @@ zio_buf_alloc(size_t size) size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_allocs[c], 1); +#endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } @@ -272,6 +288,9 @@ zio_buf_alloc_flags(size_t size, int flags) size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_allocs[c], 1); +#endif return (kmem_cache_alloc(zio_buf_cache[c], flags)); } @@ -282,6 +301,9 @@ zio_buf_free(void *buf, size_t size) size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_frees[c], 1); +#endif kmem_cache_free(zio_buf_cache[c], buf); } @@ -302,12 +324,18 @@ zio_data_buf_free(void *buf, size_t size) * 
========================================================================== */ void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_orig_data = zio->io_data; + /* + * Ensure that anyone expecting this zio to contain a linear ABD isn't + * going to get a nasty surprise when they try to access the data. + */ + IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); + + zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; @@ -315,7 +343,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; - zio->io_data = data; + zio->io_abd = data; zio->io_size = size; } @@ -327,12 +355,12 @@ zio_pop_transforms(zio_t *zio) while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, - zt->zt_orig_data, zt->zt_orig_size); + zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_abd); - zio->io_data = zt->zt_orig_data; + zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; @@ -346,21 +374,26 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_abd, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - 
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size); + abd_return_buf_copy(data, tmp, size); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } } /* @@ -529,7 +562,7 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, @@ -588,7 +621,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; - zio->io_orig_data = zio->io_data = data; + zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; @@ -731,7 +764,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -749,7 +782,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, 
enum zio_flag flags, @@ -790,7 +823,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -942,7 +975,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -963,7 +996,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -986,8 +1019,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. 
*/ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -999,7 +1033,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -1043,7 +1077,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -1104,14 +1138,17 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); - - zio_push_transform(zio, cbuf, psize, psize, zio_decompress); + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1229,7 +1266,7 @@ zio_write_bp_init(zio_t *zio) /* If it's a compressed write that is not raw, compress the buffer. 
*/ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1267,9 +1304,11 @@ zio_write_bp_init(zio_t *zio) zio_buf_free(cbuf, lsize); psize = lsize; } else { - bzero((char *)cbuf + psize, rounded - psize); + abd_t *cdata = abd_get_from_buf(cbuf, lsize); + abd_take_ownership_of_buf(cdata, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1853,26 +1892,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + 
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1883,8 +1934,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1894,7 +1949,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1902,16 +1958,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -1975,13 +2033,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - 
zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -1998,13 +2057,16 @@ zio_gang_tree_assemble_done(zio_t *zio) if (zio->io_error) return; + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_abd); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -2014,7 +2076,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2028,7 +2091,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -2037,13 +2100,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -2076,7 +2140,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -2116,6 +2181,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static int zio_write_gang_block(zio_t *pio) { @@ -2125,6 +2196,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -2151,12 +2223,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. 
*/ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. @@ -2176,9 +2250,9 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, - lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, - &gn->gn_child[g], pio->io_priority, + abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, + lsize, &zp, zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } @@ -2270,10 +2344,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - if (zio->io_error == 0 && dde->dde_repair_data == NULL) - dde->dde_repair_data = zio->io_data; + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } @@ -2306,16 +2381,16 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, + 
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); @@ -2345,8 +2420,9 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } - if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2377,8 +2453,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, - zio->io_orig_size) != 0); + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } @@ -2402,7 +2477,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || - bcmp(abuf->b_data, zio->io_orig_data, + abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); arc_buf_destroy(abuf, &abuf); @@ -2563,12 +2638,12 @@ zio_ddt_write(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } @@ -2585,13 +2660,13 @@ zio_ddt_write(zio_t *zio) ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, 
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } @@ -2846,11 +2921,11 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -2980,7 +3055,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3114,7 +3189,7 @@ zio_checksum_generate(zio_t *zio) } } - zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } @@ -3238,7 +3313,7 @@ zio_ready(zio_t *zio) if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { - ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } @@ -3253,6 +3328,7 @@ zio_ready(zio_t *zio) static int zio_done(zio_t *zio) { + uint64_t psize = zio->io_size; zio_t *pio, *pio_next; int c, w; @@ -3305,28 +3381,35 @@ zio_done(zio_t *zio) while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; - uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; 
- - if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = NULL; + abd_t *adata = zio->io_abd; + + if (asize != psize) { + adata = abd_alloc_linear(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); } + if (adata != NULL) + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); - if (asize != zio->io_size) - zio_buf_free(abuf, asize); + if (adata != NULL) + abd_return_buf(adata, abuf, asize); + + if (asize != psize) + abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ - vdev_stat_update(zio, zio->io_size); + vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index b05e787dcaac..b8d51197c359 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
*/ #include @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -62,22 +63,107 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/*ARGSUSED*/ +static int +fletcher_2_incremental_native_compat(void *buf, size_t size, void *zcp) +{ + fletcher_2_incremental_native(buf, size, (zio_cksum_t *)zcp); + return (0); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native_compat, zcp); +} + +/*ARGSUSED*/ +static int +fletcher_2_incremental_byteswap_compat(void *buf, size_t size, void *zcp) +{ + fletcher_2_incremental_byteswap(buf, size, (zio_cksum_t *) zcp); + return (0); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap_compat, zcp); +} + +/*ARGSUSED*/ +static int +fletcher_4_incremental_native_compat(void *buf, size_t size, void *zcp) +{ + fletcher_4_incremental_native(buf, size, (zio_cksum_t *)zcp); + return (0); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_native_compat, zcp); +} + +/*ARGSUSED*/ +static int +fletcher_4_incremental_byteswap_compat(void *buf, size_t size, void *zcp) +{ + fletcher_4_incremental_byteswap(buf, size, (zio_cksum_t *)zcp); + return (0); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_byteswap_compat, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, 0, 0, 0, "inherit"}, {{NULL, NULL}, 0, 0, 0, 
"on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{abd_checksum_off, abd_checksum_off}, 0, 0, 0, "off"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "label"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "gang_header"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 1, 0, "zilog"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 0, 0, + "abd_fletcher2"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 1, 0, 0, + "abd_fletcher4"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 0, 1, "sha256"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{abd_checksum_off, abd_checksum_off}, 0, 0, 0, "noparity"}, + +#ifdef _HAS_ZOL_PR_4760_ + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif /* _HAS_ZOL_PR_4760_ */ }; enum zio_checksum @@ -150,7 +236,7 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) */ void zio_checksum_compute(zio_t *zio, enum 
zio_checksum checksum, - void *data, uint64_t size) + abd_t *abd, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -162,6 +248,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_eck) { zio_eck_t *eck; + void *data = abd_to_buf(abd); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -179,16 +266,16 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](data, size, &cksum); + ci->ci_func[0](abd, size, &cksum); eck->zec_cksum = cksum; } else { - ci->ci_func[0](data, size, &bp->blk_cksum); + ci->ci_func[0](abd, size, &bp->blk_cksum); } } int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) + abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; @@ -200,25 +287,32 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (ci->ci_eck) { zio_eck_t *eck; zio_cksum_t verifier; + size_t eck_offset; + uint64_t data_size = size; + void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) + if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; - else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); - else + } else { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } - if (nused > size) + if (nused > data_size) { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)data + data_size) - 1; } 
if (checksum == ZIO_CHECKSUM_GANG_HEADER) @@ -233,10 +327,14 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - ci->ci_func[byteswap](data, size, &actual_cksum); - eck->zec_cksum = expected_cksum; + abd_return_buf_copy(abd, data, data_size); + + ci->ci_func[byteswap](abd, size, &actual_cksum); + abd_copy_from_buf_off(abd, &expected_cksum, + eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, @@ -245,7 +343,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](data, size, &actual_cksum); + ci->ci_func[byteswap](abd, size, &actual_cksum); } if (info != NULL) { @@ -273,7 +371,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 6b8d6c39bd91..7e44d16e403e 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -28,7 +28,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include @@ -41,24 +41,23 @@ /* * Compression vectors. 
*/ - zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {NULL, NULL, 0, "inherit"}, - {NULL, NULL, 0, "on"}, - {NULL, NULL, 0, "uncompressed"}, - {lzjb_compress, lzjb_decompress, 0, "lzjb"}, - {NULL, NULL, 0, "empty"}, - {gzip_compress, gzip_decompress, 1, "gzip-1"}, - {gzip_compress, gzip_decompress, 2, "gzip-2"}, - {gzip_compress, gzip_decompress, 3, "gzip-3"}, - {gzip_compress, gzip_decompress, 4, "gzip-4"}, - {gzip_compress, gzip_decompress, 5, "gzip-5"}, - {gzip_compress, gzip_decompress, 6, "gzip-6"}, - {gzip_compress, gzip_decompress, 7, "gzip-7"}, - {gzip_compress, gzip_decompress, 8, "gzip-8"}, - {gzip_compress, gzip_decompress, 9, "gzip-9"}, - {zle_compress, zle_decompress, 64, "zle"}, - {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, + {"inherit", 0, NULL, NULL}, + {"on", 0, NULL, NULL}, + {"uncompressed", 0, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress}, + {"empty", 0, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress}, + {"gzip-2", 2, gzip_compress, gzip_decompress}, + {"gzip-3", 3, gzip_compress, gzip_decompress}, + {"gzip-4", 4, gzip_compress, gzip_decompress}, + {"gzip-5", 5, gzip_compress, gzip_decompress}, + {"gzip-6", 6, gzip_compress, gzip_decompress}, + {"gzip-7", 7, gzip_compress, gzip_decompress}, + {"gzip-8", 8, gzip_compress, gzip_decompress}, + {"gzip-9", 9, gzip_compress, gzip_decompress}, + {"zle", 64, zle_compress, zle_decompress}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs} }; enum zio_compress @@ -85,12 +84,26 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + uint64_t *word; + + for (word = data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + size_t -zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, 
size_t s_len) { - uint64_t *word, *word_end; size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; + void *tmp; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); @@ -99,12 +112,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by returning zero size. */ - word_end = (uint64_t *)((char *)src + s_len); - for (word = src; word < word_end; word++) - if (*word != 0) - break; - - if (word == word_end) + if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) return (0); if (c == ZIO_COMPRESS_EMPTY) @@ -112,7 +120,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); - c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); + + /* No compression algorithms can read from ABDs directly */ + tmp = abd_borrow_buf_copy(src, s_len); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + abd_return_buf(src, tmp, s_len); if (c_len > d_len) return (s_len); @@ -122,13 +134,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) } int -zio_decompress_data(enum zio_compress c, void *src, void *dst, +zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len) { zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } + +int +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len) +{ + void *tmp = abd_borrow_buf_copy(src, s_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + abd_return_buf(src, tmp, s_len); + + return (ret); +} From 593fd67078181df00c0c2ab575d59782dbb024e5 Mon Sep 17 00:00:00 
2001 From: David Quigley Date: Tue, 2 Aug 2016 15:28:40 -0400 Subject: [PATCH 06/11] DLPX-45330 dcenter hang due to kmem reclaim --- module/zfs/abd.c | 2 +- module/zfs/arc.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index a0af84770e40..b1c66c7ee48e 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -167,7 +167,7 @@ size_t zfs_abd_chunk_size = 1024; extern vmem_t *zio_alloc_arena; #endif -static kmem_cache_t *abd_chunk_cache; +kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; static void * diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f7ca664e6b85..9653275e9fc1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4079,6 +4079,7 @@ arc_kmem_reap_now(void) extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; + extern kmem_cache_t *abd_chunk_cache; if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) { /* @@ -4103,6 +4104,7 @@ arc_kmem_reap_now(void) kmem_cache_reap_now(zio_data_buf_cache[i]); } } + kmem_cache_reap_now(abd_chunk_cache); kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); From d84a47b8d8e0e49440935cbca09dcaa96ef8b237 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Tue, 2 Aug 2016 15:30:08 -0400 Subject: [PATCH 07/11] DLPX-45422 zfs_abd_scatter_enabled=0 causes assertion failure in arc_share_buf --- module/zfs/arc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9653275e9fc1..900eb5b81c9b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2914,13 +2914,17 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - /* - * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to disk, it's - * easiest 
if we just set up sharing between the buf and the hdr. - */ - arc_hdr_free_pabd(hdr); - arc_share_buf(hdr, buf); + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. + */ + ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + } return (buf); } From a159e61ff50b9d8cd6f764495b8c17236294875e Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 5 Aug 2016 10:47:31 -0400 Subject: [PATCH 08/11] Convert ABD to use page structures instead of kmem_cache items --- include/sys/abd.h | 9 +-- module/zfs/abd.c | 149 +++++++++++++++++++++++++++------------------- module/zfs/arc.c | 2 - 3 files changed, 94 insertions(+), 66 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index ea3f7e2fb222..bc2cc62a131a 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -31,6 +31,7 @@ #include #include #ifdef _KERNEL +#include #include #endif @@ -51,12 +52,12 @@ typedef struct abd { refcount_t abd_children; union { struct abd_scatter { - uint_t abd_offset; - uint_t abd_chunk_size; - void *abd_chunks[]; + uint_t abd_offset; + uint_t abd_chunk_size; + struct page *abd_chunks[]; } abd_scatter; struct abd_linear { - void *abd_buf; + void *abd_buf; } abd_linear; } abd_u; } abd_t; diff --git a/module/zfs/abd.c b/module/zfs/abd.c index b1c66c7ee48e..dc4c632b87f4 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -59,10 +59,30 @@ * +----------------->| chunk N-1 | * +-----------+ * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). 
+ * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we weren't using segkpm (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs + * on each ABD access. (If segkpm isn't available then we use all linear + * ABDs to avoid this penalty.) See seg_kpm.c for more details. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not + * available, which is the case on all 32-bit systems and any 64-bit systems + * where kpm_enable is turned off. * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset @@ -146,60 +166,46 @@ static abd_stats_t abd_stats = { #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). 
- */ +/* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. - */ -size_t zfs_abd_chunk_size = 1024; #ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; -static void * +static struct page * abd_alloc_chunk(void) { - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + struct page *c = alloc_page(kmem_flags_convert(KM_SLEEP)); ASSERT3P(c, !=, NULL); return (c); } static void -abd_free_chunk(void *c) +abd_free_chunk(struct page *c) { - kmem_cache_free(abd_chunk_cache, c); + __free_pages(c, 0); } -void -abd_init(void) +static void * +abd_map_chunk(struct page *c) { - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif - /* - * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH - * so that no allocator metadata is stored with the buffers. + * Use of segkpm means we don't care if this is mapped S_READ or S_WRITE + * but S_WRITE is conceptually more accurate. 
*/ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); + return (kmap(c)); +} + +static void +abd_unmap_chunk(struct page *c) +{ + kunmap(c); +} +void +abd_init(void) +{ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { @@ -215,15 +221,37 @@ abd_fini(void) kstat_delete(abd_ksp); abd_ksp = NULL; } +} + +#else - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; +struct page; +#define kpm_enable 1 +#define abd_alloc_chunk() \ + ((struct page *)kmem_alloc(PAGESIZE, KM_SLEEP)) +#define abd_free_chunk(chunk) kmem_free(chunk, PAGESIZE) +#define abd_map_chunk(chunk) ((void *)chunk) +static void +abd_unmap_chunk(struct page *c) +{ } +void +abd_init(void) +{ +} + +void +abd_fini(void) +{ +} + +#endif /* _KERNEL */ + static inline size_t abd_chunkcnt_for_bytes(size_t size) { - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); } static inline size_t @@ -249,8 +277,7 @@ abd_verify(abd_t *abd) size_t n; int i; - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); + ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, PAGESIZE); n = abd_scatter_chunkcnt(abd); for (i = 0; i < n; i++) { ASSERT3P( @@ -307,7 +334,7 @@ abd_alloc(size_t size, boolean_t is_metadata) refcount_create(&abd->abd_children); abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_chunk_size = PAGESIZE; for (i = 0; i < n; i++) { void *c = abd_alloc_chunk(); @@ -318,7 +345,7 @@ abd_alloc(size_t size, boolean_t is_metadata) ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); + n * PAGESIZE - size); return (abd); } @@ -337,7 +364,7 @@ abd_free_scatter(abd_t 
*abd) ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); + abd->abd_size - n * PAGESIZE); abd_free_struct(abd); } @@ -477,7 +504,7 @@ abd_get_offset(abd_t *sabd, size_t off) } else { size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); + (new_offset / PAGESIZE); abd = abd_alloc_struct(chunkcnt); @@ -488,14 +515,12 @@ abd_get_offset(abd_t *sabd, size_t off) */ abd->abd_flags = 0; - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_offset = new_offset % PAGESIZE; + abd->abd_u.abd_scatter.abd_chunk_size = PAGESIZE; /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], + &sabd->abd_u.abd_scatter.abd_chunks[new_offset / PAGESIZE], chunkcnt * sizeof (void *)); } @@ -673,7 +698,7 @@ abd_iter_scatter_chunk_offset(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); + aiter->iter_pos) % PAGESIZE); } static inline size_t @@ -681,7 +706,7 @@ abd_iter_scatter_chunk_index(struct abd_iter *aiter) { ASSERT(!abd_is_linear(aiter->iter_abd)); return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); + aiter->iter_pos) / PAGESIZE); } /* @@ -728,10 +753,6 @@ abd_iter_map(struct abd_iter *aiter) ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - /* There's nothing left to iterate 
over, so do nothing */ if (aiter->iter_pos == aiter->iter_abd->abd_size) return; @@ -743,8 +764,9 @@ abd_iter_map(struct abd_iter *aiter) } else { size_t index = abd_iter_scatter_chunk_index(aiter); offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + aiter->iter_mapsize = PAGESIZE - offset; + paddr = abd_map_chunk( + aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]); } aiter->iter_mapaddr = (char *)paddr + offset; } @@ -760,6 +782,13 @@ abd_iter_unmap(struct abd_iter *aiter) if (aiter->iter_pos == aiter->iter_abd->abd_size) return; + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + size_t index = abd_iter_scatter_chunk_index(aiter); + abd_unmap_chunk( + aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]); + } + ASSERT3P(aiter->iter_mapaddr, !=, NULL); ASSERT3U(aiter->iter_mapsize, >, 0); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 900eb5b81c9b..4fd38323185f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4083,7 +4083,6 @@ arc_kmem_reap_now(void) extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; - extern kmem_cache_t *abd_chunk_cache; if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) { /* @@ -4108,7 +4107,6 @@ arc_kmem_reap_now(void) kmem_cache_reap_now(zio_data_buf_cache[i]); } } - kmem_cache_reap_now(abd_chunk_cache); kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); From 63780e0e04818f5e3b10a864c64e0f9aa4649cc6 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 18 Aug 2016 09:06:09 -0700 Subject: [PATCH 09/11] Fix deadlock in receive_writer_thread() and arc_write_ready() In receive_object() a new tx is created in order to create an object. While the tx is assigned and not yet commited direct reclaim occurs via the kmem_cache_alloc() in dnode_create(). 
This results in an inode being evicted which needs to be removed. The removal process creates another new tx which can't be assigned to the closed txg and the receive_writer thread blocks waiting for the next txg. However, since it's still holding a tx open in the previous txg the entire txg_sync process deadlocks. This is prevented by calling spl_fstrans_mark() which disables direct reclaim from occurring in all memory allocations performed by this process. This covers the offending kmem_cache_alloc() and any other kmem_alloc()'s which occur in the critical region. An analogous situation can occur in arc_write_ready() and is resolved the same way. Finally, abd_alloc_chunk() is changed to use kmem_flags_convert() which detects if the PF_FSTRANS flag is set for the process. When set, the __GFP_IO and __GFP_FS bits are cleared to prevent reclaim. INFO: task receive_writer:25773 blocked for more than 120 seconds. Call Trace: [] schedule+0x29/0x70 [] cv_wait_common+0x10d/0x130 [spl] [] __cv_wait+0x15/0x20 [spl] [] txg_wait_open+0xe3/0x1b0 [zfs] [] dmu_tx_wait+0x465/0x4d0 [zfs] [] dmu_tx_assign+0xc9/0x700 [zfs] [] zfs_rmnode+0x136/0x440 [zfs] [] zfs_zinactive+0xd8/0x140 [zfs] [] zfs_inactive+0x8b/0x300 [zfs] [] zpl_evict_inode+0x43/0xa0 [zfs] [] evict+0xa7/0x170 [] dispose_list+0x3e/0x50 [] prune_icache_sb+0x163/0x320 [] prune_super+0x143/0x170 [] shrink_slab+0x165/0x300 [] do_try_to_free_pages+0x3c2/0x4e0 [] try_to_free_pages+0xfc/0x180 [] __alloc_pages_nodemask+0x808/0xba0 [] alloc_pages_current+0xaa/0x170 [] new_slab+0x275/0x300 [] __slab_alloc+0x315/0x48f [] kmem_cache_alloc+0x193/0x1d0 [] spl_kmem_cache_alloc+0x99/0x150 [spl] [] dnode_create+0x34/0x280 [zfs] [] dnode_hold_impl+0x7e0/0x9d0 [zfs] [] dmu_object_claim_dnsize+0xa2/0x180 [zfs] [] receive_object.isra.8+0x3ed/0x400 [zfs] [] receive_writer_thread+0x2a8/0x760 [zfs] [] thread_generic_wrapper+0x71/0x80 [spl] [] kthread+0xcf/0xe0 Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 2 ++ 1 file changed, 2
insertions(+) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4fd38323185f..b95822ab5e8a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5687,6 +5687,7 @@ arc_write_ready(zio_t *zio) arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); enum zio_compress compress; + fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); @@ -5770,6 +5771,7 @@ arc_write_ready(zio_t *zio) } arc_hdr_verify(hdr, zio->io_bp); + spl_fstrans_unmark(cookie); } static void From 4d17e1ae00e2089a9b3164a3f56f349b64e9e3e4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Aug 2016 20:24:01 +0000 Subject: [PATCH 10/11] Fix cv_timedwait_hires The user space implementation of cv_timedwait_hires() was always passing a relative time to pthread_cond_timedwait() when an absolute time is expected. This was accidentally introduced in commit 206971d2. Replace two magic values with their corresponding preprocessor macro. 
Signed-off-by: Brian Behlendorf --- lib/libzpool/kernel.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 549dd4c5c21e..7ac505a64b91 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -509,7 +509,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) VERIFY(gettimeofday(&tv, NULL) == 0); ts.tv_sec = tv.tv_sec + delta / hz; - ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); + ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; @@ -534,6 +534,7 @@ cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { int error; + struct timeval tv; timestruc_t ts; hrtime_t delta; @@ -546,11 +547,17 @@ cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, if (delta <= 0) return (-1); - ts.tv_sec = delta / NANOSEC; - ts.tv_nsec = delta % NANOSEC; + VERIFY(gettimeofday(&tv, NULL) == 0); + + ts.tv_sec = tv.tv_sec + delta / NANOSEC; + ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); + if (ts.tv_nsec >= NANOSEC) { + ts.tv_sec++; + ts.tv_nsec -= NANOSEC; + } ASSERT(mutex_owner(mp) == curthread); - mp->m_owner = NULL; + mp->m_owner = MTX_INIT; error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); mp->m_owner = curthread; From 8d1da46b2639ddd7e3997530fa3b953ca5211cd7 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Wed, 31 Aug 2016 00:26:43 -0600 Subject: [PATCH 11/11] Added ABD page support to vdev_disk.c Signed-off-by: Isaac Huang --- include/sys/abd.h | 7 ++++ include/sys/spa.h | 1 - module/zfs/abd.c | 62 ++++++++++++++++++++++++++++++++- module/zfs/vdev_disk.c | 79 +++++++++++++----------------------------- 4 files changed, 92 insertions(+), 57 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index bc2cc62a131a..c13785487c38 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -32,6 
+32,7 @@ #include #ifdef _KERNEL #include +#include #include #endif @@ -113,6 +114,12 @@ int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); +#if defined(_KERNEL) && defined(HAVE_SPL) +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + /* * Wrappers for calls with offsets of 0 */ diff --git a/include/sys/spa.h b/include/sys/spa.h index a5b82c44d19f..07d5c5d0adea 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -577,7 +577,6 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, nvlist_t *zplprops); -extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index dc4c632b87f4..11c55e6a261e 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -459,6 +459,8 @@ abd_alloc_sametype(abd_t *sabd, size_t size) * with vanilla abd_alloc(). * * LINUX ABD TODO - once vdev_disk.c has ABD page support change to vanilla + * - TODO vdev_disk.c now has ABD page support, but some disk label checksumming + * code still assumes linear ABD. */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) @@ -1006,8 +1008,66 @@ abd_cmp(abd_t *dabd, abd_t *sabd) abd_cmp_cb, NULL)); } - #if defined(_KERNEL) && defined(HAVE_SPL) +/* + * bio_nr_pages for ABD. 
+ * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = abd->abd_u.abd_scatter.abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) + - (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. + * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, pgoff, index; + + if (io_size <= 0) + break; + + pgoff = abd_iter_scatter_chunk_offset(&aiter); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + index = abd_iter_scatter_chunk_index(&aiter); + pg = abd->abd_u.abd_scatter.abd_chunks[index]; + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + /* Tunable Parameters */ module_param(zfs_abd_scatter_enabled, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_enabled, diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 40c7ae648fea..a28eb9dc5bd8 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -43,7 +43,6 @@ static void *zfs_vdev_holder = VDEV_HOLDER; */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ - void *dr_loanbuf; /* borrowed abd buffer */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ @@ -404,7 +403,6 @@ vdev_disk_dio_put(dio_request_t *dr) */ if (rc == 0) { zio_t *zio = dr->dr_zio; - void *loanbuf = dr->dr_loanbuf; int error = dr->dr_error; vdev_disk_dio_free(dr); @@ -414,14 +412,6 @@ vdev_disk_dio_put(dio_request_t 
*dr) ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); - /* ABD placeholder */ - if (loanbuf != NULL) { - if (zio->io_type == ZIO_TYPE_READ) { - abd_copy_from_buf(zio->io_abd, loanbuf, - zio->io_size); - } - zio_buf_free(loanbuf, zio->io_size); - } zio_delay_interrupt(zio); } @@ -446,17 +436,10 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) #endif } - /* Drop reference aquired by __vdev_disk_physio */ + /* Drop reference acquired by __vdev_disk_physio */ rc = vdev_disk_dio_put(dr); } -static inline unsigned long -bio_nr_pages(void *bio_ptr, unsigned int bio_size) -{ - return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> - PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); -} - static unsigned int bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) { @@ -496,6 +479,15 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) return (bio_size); } +static unsigned int +bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) +{ + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); + + return (abd_scatter_bio_map_off(bio, abd, size, off)); +} + #ifndef bio_set_op_attrs #define bio_set_op_attrs(bio, rw, flags) \ do { (bio)->bi_rw |= (rw)|(flags); } while (0) @@ -528,16 +520,17 @@ vdev_submit_bio(struct bio *bio) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, - size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags) +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t abd_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; - ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + ASSERT(zio != NULL); + ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -556,32 +549,10 @@ __vdev_disk_physio(struct 
block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. */ - if (zio != NULL) { - abd_t *abd = zio->io_abd; - - /* - * ABD placeholder - * We can't use abd_borrow_buf routines here since our - * completion context is interrupt and abd refcounts - * take a mutex (in debug mode). - */ - if (abd_is_linear(abd)) { - bio_ptr = abd_to_buf(abd); - dr->dr_loanbuf = NULL; - } else { - bio_ptr = zio_buf_alloc(zio->io_size); - dr->dr_loanbuf = bio_ptr; - if (zio->io_type != ZIO_TYPE_READ) - abd_copy_to_buf(bio_ptr, abd, zio->io_size); - - } - } else { - bio_ptr = kbuf_ptr; - dr->dr_loanbuf = NULL; - } - bio_offset = kbuf_offset; - bio_size = kbuf_size; + abd_offset = 0; + bio_offset = io_offset; + bio_size = io_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -594,8 +565,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * are needed we allocate a larger dio and warn the user. 
*/ if (dr->dr_bio_count == i) { - if (dr->dr_loanbuf) - zio_buf_free(dr->dr_loanbuf, zio->io_size); vdev_disk_dio_free(dr); bio_count *= 2; goto retry; @@ -603,10 +572,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, /* bio_alloc() with __GFP_WAIT never returns NULL */ dr->dr_bio[i] = bio_alloc(GFP_NOIO, - MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); + MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), + BIO_MAX_PAGES)); if (unlikely(dr->dr_bio[i] == NULL)) { - if (dr->dr_loanbuf) - zio_buf_free(dr->dr_loanbuf, zio->io_size); vdev_disk_dio_free(dr); return (ENOMEM); } @@ -621,10 +589,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -756,7 +725,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, NULL, + error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, flags); if (error) { zio->io_error = error;