From 7fc10f0415b04cb8090712dad5535069ff20f6a9 Mon Sep 17 00:00:00 2001
From: George Wilson
Date: Wed, 13 Jul 2016 14:29:42 -0700
Subject: [PATCH] 6950 ARC should cache compressed data
Reviewed by: Prakash Surya
Reviewed by: Dan Kimmel
Reviewed by: Matt Ahrens
Reviewed by: Paul Dagnelie
Reviewed by: Don Brady
Reviewed by: Richard Elling

This change covers the reading and writing of compressed arc headers, sharing
data between the arc_hdr_t and the arc_buf_t, and the implementation of a new
dbuf cache to keep frequently accessed data uncompressed.

I've added a new member to the l1 arc hdr called b_pdata. The b_pdata always
hangs off the arc_buf_hdr_t (if an L1 hdr is in use) and points to the
physical block for that DVA. The physical block may or may not be compressed.
If compressed arc is enabled and the block on-disk is compressed, then the
b_pdata will match the block on-disk and remain compressed in memory. If the
block on-disk is not compressed, then neither is the b_pdata. Lastly, if
compressed arc is disabled, then b_pdata will always be an uncompressed
version of the on-disk block.

Typically the arc will cache only the arc_buf_hdr_t and will aggressively
evict any arc_buf_t's that are no longer referenced. This means that the arc
will primarily have compressed blocks, since the arc_buf_t's are considered
overhead and are always uncompressed. When a consumer reads a block, we first
look to see if the arc_buf_hdr_t is cached. If the hdr is cached, then we
allocate a new arc_buf_t and decompress the b_pdata contents into the
arc_buf_t's b_data. If the hdr already has an arc_buf_t, then we allocate an
additional arc_buf_t and bcopy the uncompressed contents from the first
arc_buf_t to the new one.

Writing to the compressed arc requires that we first discard the b_pdata,
since the physical block is about to be rewritten. The new data contents will
be passed in via an arc_buf_t (uncompressed), and during the I/O pipeline
stages we will copy the physical block contents to a newly allocated b_pdata.

When an l2arc is in use, it will also take advantage of the b_pdata. The
l2arc now always writes the contents of b_pdata to the l2arc. This means that
when compressed arc is enabled, the l2arc blocks are identical to those
stored in the main data pool. This provides a significant advantage since we
can leverage the bp's checksum when reading from the l2arc to determine
whether the contents are valid. If compressed arc is disabled, then we must
first transform the read block to look like the physical block in the main
data pool before comparing the checksum and determining its validity.
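The read path described above can be pictured with a small standalone sketch.
This is a conceptual model only, not the arc.c code in the diff below: the
types (toy_hdr_t, toy_buf_t), the helpers (decompress_pdata,
toy_arc_buf_alloc), and the field layout are simplified placeholders invented
here to show the single decision that matters -- decompress b_pdata for the
first reader, or bcopy from an existing uncompressed arc_buf_t for later
readers.

	/*
	 * Conceptual sketch only; assumes simplified placeholder types and
	 * a stubbed decompression helper rather than the real ARC interfaces.
	 */
	#include <stdlib.h>
	#include <string.h>

	typedef struct toy_buf {
		struct toy_buf	*b_next;	/* next buf sharing this hdr */
		void		*b_data;	/* uncompressed copy for a consumer */
	} toy_buf_t;

	typedef struct toy_hdr {
		void		*b_pdata;	/* physical (possibly compressed) block */
		size_t		b_psize;	/* physical (on-disk) size */
		size_t		b_lsize;	/* logical (uncompressed) size */
		int		b_compressed;	/* is b_pdata stored compressed? */
		toy_buf_t	*b_buf;		/* list of uncompressed bufs */
	} toy_hdr_t;

	static void
	decompress_pdata(const toy_hdr_t *hdr, void *dst)
	{
		/*
		 * Stand-in for real decompression (zio_decompress_data() in
		 * the actual code); copy what is stored and zero-fill the
		 * rest so the sketch stays self-contained.
		 */
		(void) memcpy(dst, hdr->b_pdata, hdr->b_psize);
		(void) memset((char *)dst + hdr->b_psize, 0,
		    hdr->b_lsize - hdr->b_psize);
	}

	/*
	 * Return an uncompressed buffer for a cached header.  If another buf
	 * already exists we copy from it; otherwise we decompress (or copy)
	 * the header's b_pdata.
	 */
	static toy_buf_t *
	toy_arc_buf_alloc(toy_hdr_t *hdr)
	{
		toy_buf_t *buf = calloc(1, sizeof (*buf));

		if (buf == NULL)
			return (NULL);
		buf->b_data = malloc(hdr->b_lsize);
		if (buf->b_data == NULL) {
			free(buf);
			return (NULL);
		}

		if (hdr->b_buf != NULL) {
			/* Another consumer already paid the decompression cost. */
			(void) memcpy(buf->b_data, hdr->b_buf->b_data,
			    hdr->b_lsize);
		} else if (hdr->b_compressed) {
			decompress_pdata(hdr, buf->b_data);
		} else {
			(void) memcpy(buf->b_data, hdr->b_pdata, hdr->b_lsize);
		}

		/* Link the new buf onto the header. */
		buf->b_next = hdr->b_buf;
		hdr->b_buf = buf;
		return (buf);
	}

The design intent, per the description above, is that only b_pdata is the
long-lived cached copy (compressed when the on-disk block is), while the
uncompressed arc_buf_t's are transient overhead that can be evicted
aggressively once no consumer references them.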
Closes #103 --- usr/src/cmd/mdb/common/mdb/mdb_ctf.c | 20 +- usr/src/cmd/mdb/common/mdb/mdb_ctf.h | 3 +- .../mdb/common/modules/conf/mapfile-extern | 3 +- usr/src/cmd/mdb/common/modules/zfs/zfs.c | 429 +- usr/src/cmd/zdb/zdb.c | 2 +- usr/src/cmd/ztest/ztest.c | 7 + .../grub/grub-0.97/stage2/zfs-include/spa.h | 10 +- usr/src/lib/libzpool/common/llib-lzpool | 3 +- usr/src/pkg/manifests/system-test-zfstest.mf | 25 + usr/src/test/test-runner/cmd/run.py | 16 +- usr/src/test/zfs-tests/include/commands.cfg | 9 + usr/src/test/zfs-tests/include/default.cfg | 3 + usr/src/test/zfs-tests/include/libtest.shlib | 24 + usr/src/test/zfs-tests/runfiles/Makefile | 5 +- .../zfs-tests/runfiles/perf-regression.run | 30 + usr/src/test/zfs-tests/tests/Makefile | 4 +- .../clean_mirror/clean_mirror_common.kshlib | 36 +- usr/src/test/zfs-tests/tests/perf/Makefile | 44 + .../test/zfs-tests/tests/perf/fio/Makefile | 41 + .../test/zfs-tests/tests/perf/fio/mkfiles.fio | 30 + .../zfs-tests/tests/perf/fio/random_reads.fio | 31 + .../tests/perf/fio/random_readwrite.fio | 35 + .../tests/perf/fio/random_writes.fio | 33 + .../tests/perf/fio/sequential_reads.fio | 31 + .../tests/perf/fio/sequential_writes.fio | 33 + usr/src/test/zfs-tests/tests/perf/perf.shlib | 240 ++ .../zfs-tests/tests/perf/regression/Makefile | 46 + .../tests/perf/regression/random_reads.ksh | 77 + .../perf/regression/random_readwrite.ksh | 77 + .../tests/perf/regression/random_writes.ksh | 69 + .../perf/regression/sequential_reads.ksh | 78 + .../regression/sequential_reads_cached.ksh | 77 + .../sequential_reads_cached_clone.ksh | 93 + .../perf/regression/sequential_writes.ksh | 69 + .../zfs-tests/tests/perf/regression/setup.ksh | 23 + .../zfs-tests/tests/perf/scripts/Makefile | 37 + .../test/zfs-tests/tests/perf/scripts/io.d | 109 + .../tests/perf/scripts/prefetch_io.d | 87 + usr/src/uts/common/fs/zfs/arc.c | 3533 +++++++++-------- usr/src/uts/common/fs/zfs/dbuf.c | 620 ++- usr/src/uts/common/fs/zfs/dmu.c | 7 +- usr/src/uts/common/fs/zfs/dmu_diff.c | 4 +- usr/src/uts/common/fs/zfs/dmu_objset.c | 15 +- usr/src/uts/common/fs/zfs/dmu_send.c | 8 +- usr/src/uts/common/fs/zfs/dmu_traverse.c | 4 +- usr/src/uts/common/fs/zfs/dnode.c | 2 +- usr/src/uts/common/fs/zfs/dnode_sync.c | 4 +- usr/src/uts/common/fs/zfs/dsl_scan.c | 6 +- usr/src/uts/common/fs/zfs/refcount.c | 24 + usr/src/uts/common/fs/zfs/sys/arc.h | 92 +- usr/src/uts/common/fs/zfs/sys/dbuf.h | 13 +- usr/src/uts/common/fs/zfs/sys/refcount.h | 2 + usr/src/uts/common/fs/zfs/sys/spa.h | 8 +- usr/src/uts/common/fs/zfs/sys/zio.h | 4 + usr/src/uts/common/fs/zfs/sys/zio_checksum.h | 4 + usr/src/uts/common/fs/zfs/zil.c | 5 +- usr/src/uts/common/fs/zfs/zio.c | 10 +- usr/src/uts/common/fs/zfs/zio_checksum.c | 58 +- 58 files changed, 4422 insertions(+), 1990 deletions(-) create mode 100644 usr/src/test/zfs-tests/runfiles/perf-regression.run create mode 100644 usr/src/test/zfs-tests/tests/perf/Makefile create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/Makefile create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio create mode 100644 usr/src/test/zfs-tests/tests/perf/perf.shlib create mode 100644 
usr/src/test/zfs-tests/tests/perf/regression/Makefile create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/regression/setup.ksh create mode 100644 usr/src/test/zfs-tests/tests/perf/scripts/Makefile create mode 100644 usr/src/test/zfs-tests/tests/perf/scripts/io.d create mode 100644 usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d diff --git a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c index 66a8b009a3c4..8b8b72fd5362 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -910,6 +910,24 @@ mdb_ctf_offsetof_by_name(const char *type, const char *member) return (off); } +ssize_t +mdb_ctf_sizeof_by_name(const char *type) +{ + mdb_ctf_id_t id; + ssize_t size; + + if (mdb_ctf_lookup_by_name(type, &id) == -1) { + mdb_warn("couldn't find type %s", type); + return (-1); + } + + if ((size = mdb_ctf_type_size(id)) == -1) { + mdb_warn("couldn't determine type size of %s", type); + return (-1); + } + + return (size); +} /*ARGSUSED*/ static int diff --git a/usr/src/cmd/mdb/common/mdb/mdb_ctf.h b/usr/src/cmd/mdb/common/mdb/mdb_ctf.h index 85e60494d0a9..21f27d782b64 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_ctf.h +++ b/usr/src/cmd/mdb/common/mdb/mdb_ctf.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright (c) 2015, Joyent, Inc. */ @@ -136,6 +136,7 @@ extern int mdb_ctf_member_info(mdb_ctf_id_t, const char *, extern int mdb_ctf_offsetof(mdb_ctf_id_t, const char *, ulong_t *); extern int mdb_ctf_num_members(mdb_ctf_id_t); extern int mdb_ctf_offsetof_by_name(const char *, const char *); +extern ssize_t mdb_ctf_sizeof_by_name(const char *); extern ssize_t mdb_ctf_offset_to_name(mdb_ctf_id_t, ulong_t, char *, size_t, int, mdb_ctf_id_t *, ulong_t *); diff --git a/usr/src/cmd/mdb/common/modules/conf/mapfile-extern b/usr/src/cmd/mdb/common/modules/conf/mapfile-extern index 2491a7a8d572..73de8f1e413d 100644 --- a/usr/src/cmd/mdb/common/modules/conf/mapfile-extern +++ b/usr/src/cmd/mdb/common/modules/conf/mapfile-extern @@ -1,6 +1,6 @@ # # Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
# # CDDL HEADER START # @@ -67,6 +67,7 @@ SYMBOL_SCOPE { mdb_ctf_module_lookup { FLAGS = EXTERN }; mdb_ctf_offsetof { FLAGS = EXTERN }; mdb_ctf_offsetof_by_name { FLAGS = EXTERN }; + mdb_ctf_sizeof_by_name { FLAGS = EXTERN }; mdb_ctf_readsym { FLAGS = EXTERN }; mdb_ctf_type_cmp { FLAGS = EXTERN }; mdb_ctf_type_invalidate { FLAGS = EXTERN }; diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 697349b02003..12f26d382c13 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef _KERNEL #define ZFS_OBJ_NAME "zfs" @@ -973,6 +974,7 @@ arc_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "mfu_ghost_evictable_metadata", "evict_l2_cached", "evict_l2_eligible", "evict_l2_ineligible", "l2_read_bytes", "l2_write_bytes", "l2_size", "l2_asize", "l2_hdr_size", + "compressed_size", "uncompressed_size", "overhead_size", NULL }; @@ -1655,7 +1657,6 @@ metaslab_walk_step(mdb_walk_state_t *wsp) return (wsp->walk_callback(msp, &ms, wsp->walk_cbdata)); } -/* ARGSUSED */ static int metaslab_walk_init(mdb_walk_state_t *wsp) { @@ -2183,6 +2184,69 @@ zio_state(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (mdb_pwalk_dcmd("zio_root", "zio", argc, argv, addr)); } +typedef struct mdb_multilist { + uint64_t ml_num_sublists; + uintptr_t ml_sublists; +} mdb_multilist_t; + +typedef struct multilist_walk_data { + uint64_t mwd_idx; + mdb_multilist_t mwd_ml; +} multilist_walk_data_t; + +/* ARGSUSED */ +static int +multilist_print_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_printf("%#lr\n", addr); + return (WALK_NEXT); +} + +static int +multilist_walk_step(mdb_walk_state_t *wsp) +{ + multilist_walk_data_t *mwd = wsp->walk_data; + + if (mwd->mwd_idx >= mwd->mwd_ml.ml_num_sublists) + return (WALK_DONE); + + wsp->walk_addr = mwd->mwd_ml.ml_sublists + + mdb_ctf_sizeof_by_name("multilist_sublist_t") * mwd->mwd_idx + + mdb_ctf_offsetof_by_name("multilist_sublist_t", "mls_list"); + + mdb_pwalk("list", multilist_print_cb, (void*)NULL, wsp->walk_addr); + mwd->mwd_idx++; + + return (WALK_NEXT); +} + +static int +multilist_walk_init(mdb_walk_state_t *wsp) +{ + multilist_walk_data_t *mwd; + + if (wsp->walk_addr == NULL) { + mdb_warn("must supply address of multilist_t\n"); + return (WALK_ERR); + } + + mwd = mdb_zalloc(sizeof (multilist_walk_data_t), UM_SLEEP | UM_GC); + if (mdb_ctf_vread(&mwd->mwd_ml, "multilist_t", "mdb_multilist_t", + wsp->walk_addr, 0) == -1) { + return (WALK_ERR); + } + + if (mwd->mwd_ml.ml_num_sublists == 0 || + mwd->mwd_ml.ml_sublists == NULL) { + mdb_warn("invalid or uninitialized multilist at %#lx\n", + wsp->walk_addr); + return (WALK_ERR); + } + + wsp->walk_data = mwd; + return (WALK_NEXT); +} + typedef struct txg_list_walk_data { uintptr_t lw_head[TXG_SIZE]; int lw_txgoff; @@ -3269,6 +3333,359 @@ rrwlock(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } +typedef struct mdb_arc_buf_hdr_t { + uint16_t b_psize; + uint16_t b_lsize; + struct { + uint32_t b_bufcnt; + uintptr_t b_state; + uintptr_t b_pdata; + } b_l1hdr; +} mdb_arc_buf_hdr_t; + +enum arc_cflags { + ARC_CFLAG_VERBOSE = 1 << 0, + ARC_CFLAG_ANON = 1 << 1, + ARC_CFLAG_MRU = 1 << 2, + ARC_CFLAG_MFU = 1 << 3, + ARC_CFLAG_BUFS = 1 << 4, +}; + +typedef struct arc_compression_stats_data { + GElf_Sym anon_sym; /* ARC_anon symbol */ + GElf_Sym mru_sym; /* ARC_mru symbol */ + GElf_Sym mrug_sym; /* ARC_mru_ghost symbol */ 
+ GElf_Sym mfu_sym; /* ARC_mfu symbol */ + GElf_Sym mfug_sym; /* ARC_mfu_ghost symbol */ + GElf_Sym l2c_sym; /* ARC_l2c_only symbol */ + uint64_t *anon_c_hist; /* histogram of compressed sizes in anon */ + uint64_t *anon_u_hist; /* histogram of uncompressed sizes in anon */ + uint64_t *anon_bufs; /* histogram of buffer counts in anon state */ + uint64_t *mru_c_hist; /* histogram of compressed sizes in mru */ + uint64_t *mru_u_hist; /* histogram of uncompressed sizes in mru */ + uint64_t *mru_bufs; /* histogram of buffer counts in mru */ + uint64_t *mfu_c_hist; /* histogram of compressed sizes in mfu */ + uint64_t *mfu_u_hist; /* histogram of uncompressed sizes in mfu */ + uint64_t *mfu_bufs; /* histogram of buffer counts in mfu */ + uint64_t *all_c_hist; /* histogram of compressed anon + mru + mfu */ + uint64_t *all_u_hist; /* histogram of uncompressed anon + mru + mfu */ + uint64_t *all_bufs; /* histogram of buffer counts in all states */ + int arc_cflags; /* arc compression flags, specified by user */ + int hist_nbuckets; /* number of buckets in each histogram */ +} arc_compression_stats_data_t; + +int +highbit64(uint64_t i) +{ + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +/* ARGSUSED */ +static int +arc_compression_stats_cb(uintptr_t addr, const void *unknown, void *arg) +{ + arc_compression_stats_data_t *data = arg; + mdb_arc_buf_hdr_t hdr; + int cbucket, ubucket, bufcnt; + + if (mdb_ctf_vread(&hdr, "arc_buf_hdr_t", "mdb_arc_buf_hdr_t", + addr, 0) == -1) { + return (WALK_ERR); + } + + /* + * Headers in the ghost states, or the l2c_only state don't have + * arc buffers linked off of them. Thus, their compressed size + * is meaningless, so we skip these from the stats. + */ + if (hdr.b_l1hdr.b_state == data->mrug_sym.st_value || + hdr.b_l1hdr.b_state == data->mfug_sym.st_value || + hdr.b_l1hdr.b_state == data->l2c_sym.st_value) { + return (WALK_NEXT); + } + + /* + * The physical size (compressed) and logical size + * (uncompressed) are in units of SPA_MINBLOCKSIZE. By default, + * we use the log2 of this value (rounded down to the nearest + * integer) to determine the bucket to assign this header to. + * Thus, the histogram is logarithmic with respect to the size + * of the header. For example, the following is a mapping of the + * bucket numbers and the range of header sizes they correspond to: + * + * 0: 0 byte headers + * 1: 512 byte headers + * 2: [1024 - 2048) byte headers + * 3: [2048 - 4096) byte headers + * 4: [4096 - 8192) byte headers + * 5: [8192 - 16394) byte headers + * 6: [16384 - 32768) byte headers + * 7: [32768 - 65536) byte headers + * 8: [65536 - 131072) byte headers + * 9: 131072 byte headers + * + * If the ARC_CFLAG_VERBOSE flag was specified, we use the + * physical and logical sizes directly. Thus, the histogram will + * no longer be logarithmic; instead it will be linear with + * respect to the size of the header. The following is a mapping + * of the first many bucket numbers and the header size they + * correspond to: + * + * 0: 0 byte headers + * 1: 512 byte headers + * 2: 1024 byte headers + * 3: 1536 byte headers + * 4: 2048 byte headers + * 5: 2560 byte headers + * 6: 3072 byte headers + * + * And so on. 
Keep in mind that a range of sizes isn't used in + * the case of linear scale because the headers can only + * increment or decrement in sizes of 512 bytes. So, it's not + * possible for a header to be sized in between whats listed + * above. + * + * Also, the above mapping values were calculated assuming a + * SPA_MINBLOCKSHIFT of 512 bytes and a SPA_MAXBLOCKSIZE of 128K. + */ + + if (data->arc_cflags & ARC_CFLAG_VERBOSE) { + cbucket = hdr.b_psize; + ubucket = hdr.b_lsize; + } else { + cbucket = highbit64(hdr.b_psize); + ubucket = highbit64(hdr.b_lsize); + } + + bufcnt = hdr.b_l1hdr.b_bufcnt; + if (bufcnt >= data->hist_nbuckets) + bufcnt = data->hist_nbuckets - 1; + + /* Ensure we stay within the bounds of the histogram array */ + ASSERT3U(cbucket, <, data->hist_nbuckets); + ASSERT3U(ubucket, <, data->hist_nbuckets); + + if (hdr.b_l1hdr.b_state == data->anon_sym.st_value) { + data->anon_c_hist[cbucket]++; + data->anon_u_hist[ubucket]++; + data->anon_bufs[bufcnt]++; + } else if (hdr.b_l1hdr.b_state == data->mru_sym.st_value) { + data->mru_c_hist[cbucket]++; + data->mru_u_hist[ubucket]++; + data->mru_bufs[bufcnt]++; + } else if (hdr.b_l1hdr.b_state == data->mfu_sym.st_value) { + data->mfu_c_hist[cbucket]++; + data->mfu_u_hist[ubucket]++; + data->mfu_bufs[bufcnt]++; + } + + data->all_c_hist[cbucket]++; + data->all_u_hist[ubucket]++; + data->all_bufs[bufcnt]++; + + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +arc_compression_stats(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + arc_compression_stats_data_t data = { 0 }; + unsigned int max_shifted = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; + unsigned int hist_size; + char range[32]; + int rc = DCMD_OK; + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, ARC_CFLAG_VERBOSE, &data.arc_cflags, + 'a', MDB_OPT_SETBITS, ARC_CFLAG_ANON, &data.arc_cflags, + 'b', MDB_OPT_SETBITS, ARC_CFLAG_BUFS, &data.arc_cflags, + 'r', MDB_OPT_SETBITS, ARC_CFLAG_MRU, &data.arc_cflags, + 'f', MDB_OPT_SETBITS, ARC_CFLAG_MFU, &data.arc_cflags) != argc) + return (DCMD_USAGE); + + if (mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_anon", &data.anon_sym) || + mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mru", &data.mru_sym) || + mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mru_ghost", &data.mrug_sym) || + mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mfu", &data.mfu_sym) || + mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_mfu_ghost", &data.mfug_sym) || + mdb_lookup_by_obj(ZFS_OBJ_NAME, "ARC_l2c_only", &data.l2c_sym)) { + mdb_warn("can't find arc state symbol"); + return (DCMD_ERR); + } + + /* + * Determine the maximum expected size for any header, and use + * this to determine the number of buckets needed for each + * histogram. If ARC_CFLAG_VERBOSE is specified, this value is + * used directly; otherwise the log2 of the maximum size is + * used. Thus, if using a log2 scale there's a maximum of 10 + * possible buckets, while the linear scale (when using + * ARC_CFLAG_VERBOSE) has a maximum of 257 buckets. 
+ */ + if (data.arc_cflags & ARC_CFLAG_VERBOSE) + data.hist_nbuckets = max_shifted + 1; + else + data.hist_nbuckets = highbit64(max_shifted) + 1; + + hist_size = sizeof (uint64_t) * data.hist_nbuckets; + + data.anon_c_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.anon_u_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.anon_bufs = mdb_zalloc(hist_size, UM_SLEEP); + + data.mru_c_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.mru_u_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.mru_bufs = mdb_zalloc(hist_size, UM_SLEEP); + + data.mfu_c_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.mfu_u_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.mfu_bufs = mdb_zalloc(hist_size, UM_SLEEP); + + data.all_c_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.all_u_hist = mdb_zalloc(hist_size, UM_SLEEP); + data.all_bufs = mdb_zalloc(hist_size, UM_SLEEP); + + if (mdb_walk("arc_buf_hdr_t_full", arc_compression_stats_cb, + &data) != 0) { + mdb_warn("can't walk arc_buf_hdr's"); + rc = DCMD_ERR; + goto out; + } + + if (data.arc_cflags & ARC_CFLAG_VERBOSE) { + rc = mdb_snprintf(range, sizeof (range), + "[n*%llu, (n+1)*%llu)", SPA_MINBLOCKSIZE, + SPA_MINBLOCKSIZE); + } else { + rc = mdb_snprintf(range, sizeof (range), + "[2^(n-1)*%llu, 2^n*%llu)", SPA_MINBLOCKSIZE, + SPA_MINBLOCKSIZE); + } + + if (rc < 0) { + /* snprintf failed, abort the dcmd */ + rc = DCMD_ERR; + goto out; + } else { + /* snprintf succeeded above, reset return code */ + rc = DCMD_OK; + } + + if (data.arc_cflags & ARC_CFLAG_ANON) { + if (data.arc_cflags & ARC_CFLAG_BUFS) { + mdb_printf("Histogram of the number of anon buffers " + "that are associated with an arc hdr.\n"); + dump_histogram(data.anon_bufs, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + mdb_printf("Histogram of compressed anon buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.anon_c_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + + mdb_printf("Histogram of uncompressed anon buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.anon_u_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + + if (data.arc_cflags & ARC_CFLAG_MRU) { + if (data.arc_cflags & ARC_CFLAG_BUFS) { + mdb_printf("Histogram of the number of mru buffers " + "that are associated with an arc hdr.\n"); + dump_histogram(data.mru_bufs, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + mdb_printf("Histogram of compressed mru buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.mru_c_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + + mdb_printf("Histogram of uncompressed mru buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.mru_u_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + + if (data.arc_cflags & ARC_CFLAG_MFU) { + if (data.arc_cflags & ARC_CFLAG_BUFS) { + mdb_printf("Histogram of the number of mfu buffers " + "that are associated with an arc hdr.\n"); + dump_histogram(data.mfu_bufs, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + + mdb_printf("Histogram of compressed mfu buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.mfu_c_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + + mdb_printf("Histogram of uncompressed mfu buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.mfu_u_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + + if (data.arc_cflags & ARC_CFLAG_BUFS) { + mdb_printf("Histogram of all buffers that " + "are associated with an arc hdr.\n"); + 
dump_histogram(data.all_bufs, data.hist_nbuckets, 0); + mdb_printf("\n"); + } + + mdb_printf("Histogram of all compressed buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.all_c_hist, data.hist_nbuckets, 0); + mdb_printf("\n"); + + mdb_printf("Histogram of all uncompressed buffers.\n" + "Each bucket represents buffers of size: %s.\n", range); + dump_histogram(data.all_u_hist, data.hist_nbuckets, 0); + +out: + mdb_free(data.anon_c_hist, hist_size); + mdb_free(data.anon_u_hist, hist_size); + mdb_free(data.anon_bufs, hist_size); + + mdb_free(data.mru_c_hist, hist_size); + mdb_free(data.mru_u_hist, hist_size); + mdb_free(data.mru_bufs, hist_size); + + mdb_free(data.mfu_c_hist, hist_size); + mdb_free(data.mfu_u_hist, hist_size); + mdb_free(data.mfu_bufs, hist_size); + + mdb_free(data.all_c_hist, hist_size); + mdb_free(data.all_u_hist, hist_size); + mdb_free(data.all_bufs, hist_size); + + return (rc); +} + /* * MDB module linkage information: * @@ -3339,6 +3756,14 @@ static const mdb_dcmd_t dcmds[] = { "print zfs debug log", dbgmsg}, { "rrwlock", ":", "print rrwlock_t, including readers", rrwlock}, + { "arc_compression_stats", ":[-vabrf]\n" + "\t-v verbose, display a linearly scaled histogram\n" + "\t-a display ARC_anon state statistics individually\n" + "\t-r display ARC_mru state statistics individually\n" + "\t-f display ARC_mfu state statistics individually\n" + "\t-b display histogram of buffer counts\n", + "print a histogram of compressed arc buffer sizes", + arc_compression_stats}, { NULL } }; @@ -3364,6 +3789,8 @@ static const mdb_walker_t walkers[] = { spa_walk_init, spa_walk_step, NULL }, { "metaslab", "given a spa_t *, walk all metaslab_t structures", metaslab_walk_init, metaslab_walk_step, NULL }, + { "multilist", "given a multilist_t *, walk all list_t structures", + multilist_walk_init, multilist_walk_step, NULL }, { "zfs_acl_node", "given a zfs_acl_t, walk all zfs_acl_nodes", zfs_acl_node_walk_init, zfs_acl_node_walk_step, NULL }, { "zfs_acl_node_aces", "given a zfs_acl_node_t, walk all ACEs", diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 812453b826a2..deef7b40848f 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -1264,7 +1264,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (err); diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index a3800f60ce2d..b21d61d3a7a6 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -187,6 +187,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5378,6 +5379,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } return (NULL); } diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h index 822f67c923f5..a159cec62770 100644 --- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h @@ -23,7 +23,7 @@ */ /* - * Copyright (c) 2013 by Delphix. 
All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -81,6 +81,8 @@ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. @@ -203,8 +205,10 @@ typedef struct blkptr { #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) #define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool index e173e16658d1..d0421bea9423 100644 --- a/usr/src/lib/libzpool/common/llib-lzpool +++ b/usr/src/lib/libzpool/common/llib-lzpool @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* LINTLIBRARY */ @@ -67,3 +67,4 @@ extern uint64_t metaslab_df_alloc_threshold; extern boolean_t zfeature_checks_disable; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 9516cd025410..4ea9b22ac3e9 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -143,6 +143,10 @@ dir path=opt/zfs-tests/tests/functional/zvol/zvol_ENOSPC dir path=opt/zfs-tests/tests/functional/zvol/zvol_cli dir path=opt/zfs-tests/tests/functional/zvol/zvol_misc dir path=opt/zfs-tests/tests/functional/zvol/zvol_swap +dir path=opt/zfs-tests/tests/perf +dir path=opt/zfs-tests/tests/perf/fio +dir path=opt/zfs-tests/tests/perf/regression +dir path=opt/zfs-tests/tests/perf/scripts file path=opt/zfs-tests/README mode=0444 file path=opt/zfs-tests/bin/chg_usr_exec mode=0555 file path=opt/zfs-tests/bin/devname2devid mode=0555 @@ -170,6 +174,7 @@ file path=opt/zfs-tests/include/properties.shlib mode=0555 file path=opt/zfs-tests/runfiles/delphix.run mode=0444 file path=opt/zfs-tests/runfiles/omnios.run mode=0444 file path=opt/zfs-tests/runfiles/openindiana.run mode=0444 +file path=opt/zfs-tests/runfiles/perf-regression.run mode=0444 file path=opt/zfs-tests/tests/functional/acl/acl.cfg mode=0555 file path=opt/zfs-tests/tests/functional/acl/acl_common.kshlib mode=0555 file path=opt/zfs-tests/tests/functional/acl/cifs/cifs.kshlib mode=0555 @@ -2247,8 +2252,28 @@ file path=opt/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_005_pos \ mode=0555 file path=opt/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_006_pos \ mode=0555 +file path=opt/zfs-tests/tests/perf/fio/mkfiles.fio mode=0444 +file path=opt/zfs-tests/tests/perf/fio/random_reads.fio mode=0444 +file path=opt/zfs-tests/tests/perf/fio/random_readwrite.fio mode=0444 +file path=opt/zfs-tests/tests/perf/fio/random_writes.fio mode=0444 +file path=opt/zfs-tests/tests/perf/fio/sequential_reads.fio mode=0444 +file path=opt/zfs-tests/tests/perf/fio/sequential_writes.fio 
mode=0444 +file path=opt/zfs-tests/tests/perf/perf.shlib mode=0555 +file path=opt/zfs-tests/tests/perf/regression/random_reads mode=0555 +file path=opt/zfs-tests/tests/perf/regression/random_readwrite mode=0555 +file path=opt/zfs-tests/tests/perf/regression/random_writes mode=0555 +file path=opt/zfs-tests/tests/perf/regression/sequential_reads mode=0555 +file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached \ + mode=0555 +file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached_clone \ + mode=0555 +file path=opt/zfs-tests/tests/perf/regression/sequential_writes mode=0555 +file path=opt/zfs-tests/tests/perf/regression/setup mode=0555 +file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0555 +file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0555 license cr_Sun license=cr_Sun license lic_CDDL license=lic_CDDL +#depend fmri=benchmark/fio type=require depend fmri=system/file-system/zfs/tests type=require depend fmri=system/test/testrunner type=require depend fmri=system/xopen/xcu4 type=require diff --git a/usr/src/test/test-runner/cmd/run.py b/usr/src/test/test-runner/cmd/run.py index 44bb9a3f5a29..65a04a241361 100644 --- a/usr/src/test/test-runner/cmd/run.py +++ b/usr/src/test/test-runner/cmd/run.py @@ -27,6 +27,7 @@ from subprocess import PIPE from subprocess import Popen from sys import argv +from sys import maxint from sys import exit from threading import Timer from time import time @@ -144,13 +145,16 @@ class Cmd(object): def __init__(self, pathname, outputdir=None, timeout=None, user=None): self.pathname = pathname self.outputdir = outputdir or 'BASEDIR' - self.timeout = timeout or 60 + self.timeout = timeout self.user = user or '' self.killed = False self.result = Result() + if self.timeout == None: + self.timeout = 60 + def __str__(self): - return "Pathname: %s\nOutputdir: %s\nTimeout: %s\nUser: %s\n" % ( + return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % ( self.pathname, self.outputdir, self.timeout, self.user) def kill_cmd(self, proc): @@ -227,6 +231,10 @@ def run(self, options): try: self.result.starttime = time() proc = Popen(privcmd, stdout=PIPE, stderr=PIPE) + + # Allow a special timeout value of 0 to mean infinity + if int(self.timeout) == 0: + self.timeout = maxint t = Timer(int(self.timeout), self.kill_cmd, [proc]) t.start() self.result.stdout, self.result.stderr = self.collect_output(proc) @@ -315,7 +323,7 @@ def __str__(self): pre_user = ' (as %s)' % (self.pre_user) if len(self.post_user): post_user = ' (as %s)' % (self.post_user) - return "Pathname: %s\nOutputdir: %s\nTimeout: %s\nPre: %s%s\nPost: " \ + return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nPre: %s%s\nPost: " \ "%s%s\nUser: %s\n" % (self.pathname, self.outputdir, self.timeout, self.pre, pre_user, self.post, post_user, self.user) @@ -390,7 +398,7 @@ def __str__(self): pre_user = ' (as %s)' % (self.pre_user) if len(self.post_user): post_user = ' (as %s)' % (self.post_user) - return "Pathname: %s\nOutputdir: %s\nTests: %s\nTimeout: %s\n" \ + return "Pathname: %s\nOutputdir: %s\nTests: %s\nTimeout: %d\n" \ "Pre: %s%s\nPost: %s%s\nUser: %s\n" % (self.pathname, self.outputdir, self.tests, self.timeout, self.pre, pre_user, self.post, post_user, self.user) diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg index c65cafb3dfb1..33a6fe21dd34 100644 --- a/usr/src/test/zfs-tests/include/commands.cfg +++ b/usr/src/test/zfs-tests/include/commands.cfg @@ -41,17 +41,20 @@ export DF="/usr/bin/df" export 
DIFF="/usr/bin/diff" export DIRCMP="/usr/bin/dircmp" export DIRNAME="/usr/bin/dirname" +export DTRACE="/usr/sbin/dtrace" export DU="/usr/bin/du" export DUMPADM="/usr/sbin/dumpadm" export ECHO="/usr/bin/echo" export EGREP="/usr/bin/egrep" # Don't use $ENV here, because in ksh scripts it evaluates to # $HOME/.kshrc - likely not what you wanted. +export FALSE="/usr/bin/false" export FDISK="/usr/sbin/fdisk" export FF="/usr/sbin/ff" export FGREP="/usr/bin/fgrep" export FILE="/usr/bin/file" export FIND="/usr/bin/find" +export FIO="/usr/bin/fio" export FMADM="/usr/sbin/fmadm" export FMDUMP="/usr/sbin/fmdump" export FORMAT="/usr/sbin/format" @@ -70,6 +73,7 @@ export GROUPS="/usr/bin/groups" export HEAD="/usr/bin/head" export HOSTNAME="/usr/bin/hostname" export ID="/usr/bin/id" +export IOSTAT="/usr/bin/iostat" export ISAINFO="/usr/bin/isainfo" export KILL="/usr/bin/kill" export KSH="/usr/bin/ksh" @@ -88,6 +92,7 @@ export MKNOD="/usr/sbin/mknod" export MODINFO="/usr/sbin/modinfo" export MODUNLOAD="/usr/sbin/modunload" export MOUNT="/usr/sbin/mount" +export MPSTAT="/usr/bin/mpstat" export MV="/usr/bin/mv" export NAWK="/usr/bin/nawk" export NCHECK="/usr/sbin/ncheck" @@ -108,6 +113,7 @@ export PSRINFO="/usr/sbin/psrinfo" export PWD="/usr/bin/pwd" export PYTHON="/usr/bin/python" export QUOTAON="/usr/sbin/quotaon" +export READLINK="/usr/bin/readlink" export RCP="/usr/bin/rcp" export REBOOT="/usr/sbin/reboot" export RM="/usr/bin/rm" @@ -130,6 +136,7 @@ export SWAPADD="/sbin/swapadd" export SYNC="/usr/bin/sync" export TAIL="/usr/bin/tail" export TAR="/usr/sbin/tar" +export TIMEOUT="/usr/bin/timeout" export TOUCH="/usr/bin/touch" export TR="/usr/bin/tr" export TRUNCATE="/usr/bin/truncate" @@ -148,11 +155,13 @@ export UNSHARE="/usr/sbin/unshare" export USERADD="/usr/sbin/useradd" export USERDEL="/usr/sbin/userdel" export USERMOD="/usr/sbin/usermod" +export VMSTAT="/usr/bin/vmstat" export WAIT="/usr/bin/wait" export WC="/usr/bin/wc" export ZDB="/usr/sbin/zdb" export ZFS="/usr/sbin/zfs" export ZHACK="/usr/sbin/zhack" +export ZINJECT="/usr/sbin/zinject" export ZLOGIN="/usr/sbin/zlogin" export ZLOOK="/usr/bin/zlook" export ZONEADM="/usr/sbin/zoneadm" diff --git a/usr/src/test/zfs-tests/include/default.cfg b/usr/src/test/zfs-tests/include/default.cfg index 61fb25e628d5..faa1c108ef72 100644 --- a/usr/src/test/zfs-tests/include/default.cfg +++ b/usr/src/test/zfs-tests/include/default.cfg @@ -81,12 +81,15 @@ export COMPRESSION_PROP=on export CHECKSUM_PROP=on # some common variables used by test scripts : +export FIO_SCRIPTS=$STF_SUITE/tests/perf/fio +export PERF_SCRIPTS=$STF_SUITE/tests/perf/scripts # some test pool names export TESTPOOL=testpool.$$ export TESTPOOL1=testpool1.$$ export TESTPOOL2=testpool2.$$ export TESTPOOL3=testpool3.$$ +export PERFPOOL=perfpool # some test file system names export TESTFS=testfs.$$ diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index 30f10fcf9f89..ded6ae48950f 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -2446,3 +2446,27 @@ function vdevs_in_pool return 0; } + +function get_max +{ + typeset -l i max=$1 + shift + + for i in "$@"; do + max=$(echo $((max > i ? max : i))) + done + + echo $max +} + +function get_min +{ + typeset -l i min=$1 + shift + + for i in "$@"; do + min=$(echo $((min < i ? 
min : i))) + done + + echo $min +} diff --git a/usr/src/test/zfs-tests/runfiles/Makefile b/usr/src/test/zfs-tests/runfiles/Makefile index 0721285dba5a..039a0caff3af 100644 --- a/usr/src/test/zfs-tests/runfiles/Makefile +++ b/usr/src/test/zfs-tests/runfiles/Makefile @@ -16,7 +16,10 @@ include $(SRC)/Makefile.master -SRCS = delphix.run openindiana.run omnios.run +SRCS = delphix.run \ + openindiana.run \ + omnios.run \ + perf-regression.run ROOTOPTPKG = $(ROOT)/opt/zfs-tests RUNFILES = $(ROOTOPTPKG)/runfiles diff --git a/usr/src/test/zfs-tests/runfiles/perf-regression.run b/usr/src/test/zfs-tests/runfiles/perf-regression.run new file mode 100644 index 000000000000..0095931ad5f0 --- /dev/null +++ b/usr/src/test/zfs-tests/runfiles/perf-regression.run @@ -0,0 +1,30 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 0 +post_user = root +post = cleanup +outputdir = /var/tmp/test_results + +[/opt/zfs-tests/tests/perf/regression] +tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_cached', + 'sequential_reads_cached_clone', 'random_reads', 'random_writes', + 'random_readwrite'] +post = diff --git a/usr/src/test/zfs-tests/tests/Makefile b/usr/src/test/zfs-tests/tests/Makefile index 5502e74dfd04..0f569c9400ae 100644 --- a/usr/src/test/zfs-tests/tests/Makefile +++ b/usr/src/test/zfs-tests/tests/Makefile @@ -10,11 +10,11 @@ # # -# Copyright (c) 2012 by Delphix. All rights reserved. +# Copyright (c) 2012, 2015 by Delphix. All rights reserved. # .PARALLEL: $(SUBDIRS) -SUBDIRS = functional stress +SUBDIRS = functional perf include $(SRC)/test/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib b/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib index 3fd6c02aa1b2..91a6d132d2e6 100644 --- a/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib +++ b/usr/src/test/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib @@ -25,7 +25,7 @@ # # -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2013, 2015 by Delphix. All rights reserved. # . $STF_SUITE/tests/functional/clean_mirror/default.cfg @@ -36,6 +36,32 @@ # the contents of the mirror. # This code is sourced into each of these test cases. +# +# Synchronize all the data in pool +# +# $1 pool name +# +function sync_pool #pool +{ + typeset pool=$1 + + log_must $SYNC + log_must $SLEEP 2 + # Flush all the pool data. + typeset -i ret + $ZPOOL scrub $pool >/dev/null 2>&1 + ret=$? + (( $ret != 0 )) && \ + log_fail "$ZPOOL scrub $pool failed." + + while ! is_pool_scrubbed $pool; do + if is_pool_resilvered $pool ; then + log_fail "$pool should not be resilver completed." + fi + log_must $SLEEP 2 + done +} + function overwrite_verify_mirror { typeset AFFECTED_DEVICE=$1 @@ -60,6 +86,12 @@ function overwrite_verify_mirror atfile=0 + # + # Flush out the cache so that we ensure we're reading from disk. 
+ # + log_must $ZPOOL export $TESTPOOL + log_must $ZPOOL import $TESTPOOL + typeset -i failedcount=0 while (( atfile < FILE_COUNT )); do files[$atfile]=$TESTDIR/file.$atfile @@ -75,4 +107,6 @@ function overwrite_verify_mirror log_fail "of the $FILE_COUNT files $failedcount did not " \ "have the same checksum before and after." fi + + sync_pool $TESTPOOL } diff --git a/usr/src/test/zfs-tests/tests/perf/Makefile b/usr/src/test/zfs-tests/tests/perf/Makefile new file mode 100644 index 000000000000..7886eabad6cc --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/Makefile @@ -0,0 +1,44 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +.PARALLEL: $(SUBDIRS) + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf + +PROGS = perf.shlib + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) + +SUBDIRS = fio \ + regression \ + scripts + +include $(SRC)/test/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/perf/fio/Makefile b/usr/src/test/zfs-tests/tests/perf/fio/Makefile new file mode 100644 index 000000000000..012e286de12e --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/fio + +FILES = mkfiles.fio \ + random_reads.fio \ + random_readwrite.fio \ + random_writes.fio \ + sequential_reads.fio \ + sequential_writes.fio + +CMDS = $(FILES:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0444 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio new file mode 100644 index 000000000000..f876bd63d330 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio @@ -0,0 +1,30 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +ioengine=psync +bs=1024k +rw=write +thread=1 +directory=/${TESTFS} +numjobs=${NUMJOBS} +filesize=${FILE_SIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio new file mode 100644 index 000000000000..25dd2ff838df --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_reads.fio @@ -0,0 +1,31 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=randread +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio new file mode 100644 index 000000000000..0b750260ff96 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio @@ -0,0 +1,35 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +nrfiles=16 +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=randrw +rwmixread=80 +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bssplit=4k/50:8k/30:128k/10:1m/10 +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio new file mode 100644 index 000000000000..b1860a71dd37 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +thread=1 +rw=randwrite +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +filesize=${FILESIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio new file mode 100644 index 000000000000..b7d9fea5f374 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_reads.fio @@ -0,0 +1,31 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +overwrite=0 +thread=1 +rw=read +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio new file mode 100644 index 000000000000..df1590cf11dc --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +[global] +filename_format=file$jobnum +group_reporting=1 +fallocate=0 +thread=1 +rw=write +time_based=1 +directory=/${TESTFS} +runtime=${RUNTIME} +bs=${BLOCKSIZE} +ioengine=psync +sync=${SYNC_TYPE} +numjobs=${NUMJOBS} +filesize=${FILESIZE} +buffer_compress_percentage=33 +buffer_compress_chunk=4096 + +[job] diff --git a/usr/src/test/zfs-tests/tests/perf/perf.shlib b/usr/src/test/zfs-tests/tests/perf/perf.shlib new file mode 100644 index 000000000000..2b4d0433496a --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/perf.shlib @@ -0,0 +1,240 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# If neither is specified, do a nightly run. +[[ -z $PERF_REGRESSION_WEEKLY ]] && export PERF_REGRESSION_NIGHTLY=1 + +# Default runtime for each type of test run. 
+export PERF_RUNTIME_WEEKLY=$((30 * 60)) +export PERF_RUNTIME_NIGHTLY=$((10 * 60)) + +# Default fs creation options +export PERF_FS_OPTS=${PERF_FS_OPTS:-'-o recsize=8k -o compress=lz4' \ + ' -o checksum=sha256 -o redundant_metadata=most'} + +function get_sync_str +{ + typeset sync=$1 + typeset sync_str='' + + [[ $sync -eq 0 ]] && sync_str='async' + [[ $sync -eq 1 ]] && sync_str='sync' + echo $sync_str +} + +# +# This function will run fio in a loop, according to the .fio file passed +# in and a number of environment variables. The following variables can be +# set before launching zfstest to override the defaults. +# +# PERF_RUNTIME: The time in seconds each fio invocation should run. +# PERF_RUNTYPE: A human readable tag that appears in logs. The defaults are +# nightly and weekly. +# PERF_NTHREADS: A list of how many threads each fio invocation will use. +# PERF_SYNC_TYPES: Whether to use (O_SYNC) or not. 1 is sync IO, 0 is async IO. +# PERF_IOSIZES: A list of blocksizes in which each fio invocation will do IO. +# PERF_COLLECT_SCRIPTS: A comma delimited list of 'command args, logfile_tag' +# pairs that will be added to the scripts specified in each test. +# +function do_fio_run +{ + typeset script=$1 + typeset do_recreate=$2 + typeset clear_cache=$3 + typeset threads sync iosize + + for threads in $PERF_NTHREADS; do + for sync in $PERF_SYNC_TYPES; do + for iosize in $PERF_IOSIZES; do + log_note "Running with $threads" \ + "$(get_sync_str $sync) threads, $iosize ios" + + if $do_recreate; then + recreate_perfpool + log_must $ZFS create $PERF_FS_OPTS \ + $TESTFS + fi + + if $clear_cache; then + # Clear the ARC + $ZPOOL export $PERFPOOL + $ZPOOL import $PERFPOOL + fi + + export RUNTIME=$PERF_RUNTIME + export FILESIZE=$((TOTAL_SIZE / threads)) + export NUMJOBS=$threads + export SYNC_TYPE=$sync + export BLOCKSIZE=$iosize + $SYNC + + # Start the data collection + do_collect_scripts $threads $sync $iosize + + # Start the load + log_must $FIO $FIO_SCRIPTS/$script + done + done + done +} + +# +# This function iterates through the value pairs in $PERF_COLLECT_SCRIPTS. +# The script at index N is launched in the background, with its output +# redirected to a logfile containing the tag specified at index N + 1. +# +function do_collect_scripts +{ + typeset threads=$1 + typeset sync=$2 + typeset iosize=$3 + + [[ -n $collect_scripts ]] || log_fail "No data collection scripts." + [[ -n $PERF_RUNTIME ]] || log_fail "No runtime specified." + + # This will be part of the output filename. + typeset sync_str=$(get_sync_str $sync) + typeset suffix="$sync_str.$iosize-ios.$threads-threads" + + # Add in user supplied scripts and logfiles, if any. + typeset oIFS=$IFS + IFS=',' + for item in $PERF_COLLECT_SCRIPTS; do + collect_scripts+=($($ECHO $item | $SED 's/^ *//g')) + done + IFS=$oIFS + + typeset idx=0 + while [[ $idx -lt "${#collect_scripts[@]}" ]]; do + typeset logbase="$(get_perf_output_dir)/$($BASENAME \ + $SUDO_COMMAND)" + typeset outfile="$logbase.${collect_scripts[$idx + 1]}.$suffix" + + $TIMEOUT $PERF_RUNTIME ${collect_scripts[$idx]} >$outfile 2>&1 & + ((idx += 2)) + done + + # Need to explicitly return 0 because timeout(1) will kill + # a child process and cause us to return non-zero. + return 0 +} + +# Find a place to deposit performance data collected while under load. +function get_perf_output_dir +{ + typeset dir="$(pwd)/perf_data" + [[ -d $dir ]] || $MKDIR -p $dir + + $ECHO $dir +} + +# +# Destroy and create the pool used for performance tests. 
The +# PERFPOOL_CREATE_CMD variable allows users to test with a custom pool +# configuration by specifying the pool creation command in their environment. +# If PERFPOOL_CREATE_CMD is empty, a pool using all available disks is created. +# +function recreate_perfpool +{ + [[ -n $PERFPOOL ]] || log_fail "The \$PERFPOOL variable isn't set." + + poolexists $PERFPOOL && destroy_pool $PERFPOOL + + if [[ -n $PERFPOOL_CREATE_CMD ]]; then + log_must $PERFPOOL_CREATE_CMD + else + log_must eval "$ZPOOL create -f $PERFPOOL $DISKS" + fi +} + +function get_max_arc_size +{ + typeset -l max_arc_size=$(dtrace -qn 'BEGIN { + printf("%u\n", `arc_stats.arcstat_c_max.value.ui64); + exit(0); + }') + + [[ $? -eq 0 ]] || log_fail "get_max_arc_size failed" + + echo $max_arc_size +} + +# Create a file with some information about how this system is configured. +function get_system_config +{ + typeset config=$PERF_DATA_DIR/$1 + + echo "{" >>$config + $DTRACE -qn 'BEGIN{ + printf(" \"ncpus\": %d,\n", `ncpus); + printf(" \"physmem\": %u,\n", `physmem * `_pagesize); + printf(" \"c_max\": %u,\n", `arc_stats.arcstat_c_max.value.ui64); + printf(" \"kmem_flags\": \"0x%x\",", `kmem_flags); + exit(0)}' >>$config + $ECHO " \"hostname\": \"$($UNAME -n)\"," >>$config + $ECHO " \"kernel version\": \"$($UNAME -v)\"," >>$config + $IOSTAT -En | $AWK 'BEGIN { + printf(" \"disks\": {\n"); first = 1} + /^c/ {disk = $1} + /^Size: [^0]/ {size = $2; + if (first != 1) {printf(",\n")} else {first = 0} + printf(" \"%s\": \"%s\"", disk, size)} + END {printf("\n },\n")}' >>$config + $SED -n 's/^set \(.*\)[ ]=[ ]\(.*\)/\1=\2/p' /etc/system | \ + $AWK -F= 'BEGIN {printf(" \"system\": {\n"); first = 1} + {if (first != 1) {printf(",\n")} else {first = 0}; + printf(" \"%s\": %s", $1, $2)} + END {printf("\n }\n")}' >>$config + echo "}" >>$config +} + +function num_jobs_by_cpu +{ + typeset ncpu=$($PSRINFO | $WC -l) + typeset num_jobs=$ncpu + + [[ $ncpu -gt 8 ]] && num_jobs=$($ECHO "$ncpu * 3 / 4" | $BC) + + $ECHO $num_jobs +} + +function pool_to_lun_list +{ + typeset pool=$1 + typeset ctd ctds devname lun + typeset lun_list=':' + + ctds=$($ZPOOL list -v $pool | $AWK '/c[0-9]*t[0-9a-fA-F]*d[0-9]*/ \ + {print $1}') + + for ctd in $ctds; do + # Get the device name as it appears in /etc/path_to_inst + devname=$($READLINK -f /dev/dsk/${ctd}s0 | $SED -n \ + 's/\/devices\([^:]*\):.*/\1/p') + # Add a string composed of the driver name and instance + # number to the list for comparison with dev_statname. + lun=$($SED 's/"//g' /etc/path_to_inst | $GREP $devname | $AWK \ + '{print $3$2}') + lun_list="$lun_list$lun:" + done + echo $lun_list +} + +# Create a perf_data directory to hold performance statistics and +# configuration information. +export PERF_DATA_DIR=$(get_perf_output_dir) +[[ -f $PERF_DATA_DIR/config.json ]] || get_system_config config.json diff --git a/usr/src/test/zfs-tests/tests/perf/regression/Makefile b/usr/src/test/zfs-tests/tests/perf/regression/Makefile new file mode 100644 index 000000000000..1b4aa2befeb9 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/Makefile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. 
All rights reserved. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/regression + +PROGS = random_reads \ + random_readwrite \ + random_writes \ + sequential_reads \ + sequential_reads_cached \ + sequential_reads_cached_clone \ + sequential_writes \ + setup + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: %.ksh + $(INS.rename) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh new file mode 100644 index 000000000000..2395b89183ae --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is cleared with `zinject -a` prior to each run +# so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. 
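+#
+# The (command, logfile tag) pairs exported just below are consumed by
+# do_collect_scripts in perf.shlib: each command is run in the background
+# under $TIMEOUT for $PERF_RUNTIME seconds and its output is redirected to a
+# file named after the tag and the run parameters. A minimal sketch of what
+# that expands to for one collector (the threads/sync/iosize values here are
+# purely illustrative, not taken from this test):
+#
+#	suffix="sync.8k-ios.64-threads"
+#	logbase="$(get_perf_output_dir)/$($BASENAME $SUDO_COMMAND)"
+#	$TIMEOUT $PERF_RUNTIME $IOSTAT -xcnz 1 >"$logbase.iostat.$suffix" 2>&1 &
+#
+# Extra collectors may be supplied at run time through PERF_COLLECT_SCRIPTS,
+# e.g. PERF_COLLECT_SCRIPTS="$DTRACE -s /var/tmp/my.d $PERFPOOL, mydtrace".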
+lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random reads with $PERF_RUNTYPE settings" +do_fio_run random_reads.fio $FALSE $TRUE +log_pass "Measure IO stats during random read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh new file mode 100644 index 000000000000..be87e433195d --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_readwrite job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read and write from are created prior to the first fio run, +# and used for all fio runs. The ARC is cleared with `zinject -a` prior to +# each run so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random read-write load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES='' # bssplit used instead +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES='' # bssplit used instead +fi + +# Layout the files to be used by the readwrite tests. Create as many files +# as the largest number of threads. An fio run with fewer threads will use +# a subset of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. 
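+#
+# Note that PERF_IOSIZES is intentionally empty above: the random_readwrite
+# job file mixes its own block sizes with fio's bssplit option instead of
+# taking a single BLOCKSIZE from the environment. Purely as an illustration
+# of the syntax (these are not the values used by random_readwrite.fio), a
+# bssplit line in a fio job looks like:
+#
+#	bssplit=8k/90:128k/10
+#
+# i.e. 90% of the IOs are issued at 8k and 10% at 128k.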
+lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random reads and writes with $PERF_RUNTYPE settings" +do_fio_run random_readwrite.fio $FALSE $TRUE +log_pass "Measure IO stats during random read and write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh new file mode 100644 index 000000000000..a4469c32a8b0 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh @@ -0,0 +1,69 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the random_writes job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# Prior to each fio run the dataset is recreated, and fio writes new files +# into an otherwise empty pool. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during random write load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +fi + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Random writes with $PERF_RUNTYPE settings" +do_fio_run random_writes.fio $TRUE $FALSE +log_pass "Measure IO stats during random write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh new file mode 100644 index 000000000000..b04d06c9394b --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh @@ -0,0 +1,78 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is cleared with `zinject -a` prior to each run +# so reads will go to disk. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential reads with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $TRUE +log_pass "Measure IO stats during sequential read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh new file mode 100644 index 000000000000..70cddd4eedbc --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh @@ -0,0 +1,77 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
+# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is not cleared to ensure that all data is cached. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Make sure the working set can be cached in the arc. Aim for 1/2 of arc. +export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential cached reads with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $FALSE +log_pass "Measure IO stats during sequential cached read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh new file mode 100644 index 000000000000..c4790f1fc413 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh @@ -0,0 +1,93 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. This test will exercise cached read performance from +# a clone filesystem. 
The data is initially cached in the ARC and then +# a snapshot and clone are created. All the performance runs are then +# initiated against the clone filesystem to exercise the performance of +# reads when the ARC has to create another buffer from a different dataset. +# It will also exercise the need to evict the duplicate buffer once the last +# reference on that buffer is released. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Make sure the working set can be cached in the arc. Aim for 1/2 of arc. +export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must $FIO $FIO_SCRIPTS/mkfiles.fio + +log_note "Creating snapshot, $TESTSNAP, of $TESTFS" +create_snapshot $TESTFS $TESTSNAP +log_note "Creating clone, $PERFPOOL/$TESTCLONE, from $TESTFS@$TESTSNAP" +create_clone $TESTFS@$TESTSNAP $PERFPOOL/$TESTCLONE + +# +# Reset the TESTFS to point to the clone +# +export TESTFS=$PERFPOOL/$TESTCLONE + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "$VMSTAT 1" "vmstat" + "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential cached reads from $TESTFS with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio $FALSE $FALSE +log_pass "Measure IO stats during sequential cached read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh new file mode 100644 index 000000000000..5e587e864159 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh @@ -0,0 +1,69 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_writes job file. 
The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# Prior to each fio run the dataset is recreated, and fio writes new files +# into an otherwise empty pool. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +log_assert "Measure IO stats during sequential write load" +log_onexit cleanup + +function cleanup +{ + log_must $ZFS destroy $TESTFS +} + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must $ZFS create $PERF_FS_OPTS $TESTFS + +# Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. +export TOTAL_SIZE=$(($(get_prop avail $TESTFS) * 3 / 2)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'8 16'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k 128k 1m'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'8k 128k 1m'} +fi + +# Set up the scripts and output files that will log performance data. +lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" + "$VMSTAT 1" "vmstat" "$MPSTAT 1" "mpstat" "$IOSTAT -xcnz 1" "iostat") + +log_note "Sequential writes with $PERF_RUNTYPE settings" +do_fio_run sequential_writes.fio $TRUE $FALSE +log_pass "Measure IO stats during sequential write load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh b/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh new file mode 100644 index 000000000000..1206bbdaac6b --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/setup.ksh @@ -0,0 +1,23 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" +verify_disk_count "$DISKS" 3 + +log_pass diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/Makefile b/usr/src/test/zfs-tests/tests/perf/scripts/Makefile new file mode 100644 index 000000000000..aad3dd594361 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/Makefile @@ -0,0 +1,37 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. 
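+#
+# The DTrace collectors delivered by this Makefile (io.d and prefetch_io.d)
+# are normally launched by do_collect_scripts in perf.shlib. For reference,
+# a manual invocation looks roughly like the following, where the pool name,
+# lun list and interval are illustrative values only:
+#
+#	/opt/zfs-tests/tests/perf/scripts/io.d perfpool ':sd0:sd1:' 1
+#	/opt/zfs-tests/tests/perf/scripts/prefetch_io.d perfpool 1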
+# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TESTDIR = $(ROOTOPTPKG)/tests/perf/scripts + +PROGS = io.d \ + prefetch_io.d + +CMDS = $(PROGS:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all lint clean clobber: + +install: $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/io.d b/usr/src/test/zfs-tests/tests/perf/scripts/io.d new file mode 100644 index 000000000000..bbcbf8dc54c0 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/io.d @@ -0,0 +1,109 @@ +#!/usr/sbin/dtrace -s + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +/* + * time: Seconds since the epoch + * @ops: The number of reads and writes per interval + * @bytes: Bytes read and written per interval + * @latencies: Mean read and write latency per interval in ns + * These aggregations are indexed with read/write for back end + * statistics and zfs_read/zfs_write for ZPL level statistics. + */ + +#pragma D option aggsortkey +#pragma D option quiet + +BEGIN +{ + @ops["read"] = count(); + @ops["write"] = count(); + @ops["zfs_read"] = count(); + @ops["zfs_write"] = count(); + @latencies["read"] = avg(0); + @latencies["write"] = avg(0); + @latencies["zfs_read"] = avg(0); + @latencies["zfs_write"] = avg(0); + @bytes["read"] = sum(0); + @bytes["write"] = sum(0); + @bytes["zfs_read"] = sum(0); + @bytes["zfs_write"] = sum(0); + clear(@ops); + clear(@latencies); + clear(@bytes); +} + +fbt:zfs:zfs_read:entry, +fbt:zfs:zfs_write:entry +{ + this->zp = (znode_t *)args[0]->v_data; + this->poolname = stringof(this->zp->z_zfsvfs->z_os->os_spa->spa_name); +} + +fbt:zfs:zfs_read:entry, +fbt:zfs:zfs_write:entry +/ this->poolname == $$1 / +{ + self->ts = timestamp; + @ops[probefunc] = count(); + @bytes[probefunc] = sum(args[1]->uio_resid); +} + +fbt:zfs:zfs_read:return, +fbt:zfs:zfs_write:return +/ self->ts != 0 / +{ + @latencies[probefunc] = avg(timestamp - self->ts); + self->ts = 0; +} + +io:::start +/ strstr($$2, args[1]->dev_statname) != NULL / +{ + start[args[0]->b_edev, args[0]->b_blkno] = timestamp; +} + +io:::done +/ start[args[0]->b_edev, args[0]->b_blkno] / +{ + this->elapsed = timestamp - start[args[0]->b_edev, args[0]->b_blkno]; + this->name = args[0]->b_flags & B_READ ? 
"read" : "write"; + @ops[this->name] = count(); + @bytes[this->name] = sum(args[0]->b_bcount); + @latencies[this->name] = avg(this->elapsed); + start[args[0]->b_edev, args[0]->b_blkno] = 0; +} + +tick-$3s +{ + printf("%u\n", `time); + printa("ops_%-21s%@u\n", @ops); + printa("bytes_%-21s%@u\n", @bytes); + printa("latencies_%-21s%@u\n", @latencies); + + clear(@ops); + clear(@bytes); + clear(@latencies); +} + +ERROR +{ + trace(arg1); + trace(arg2); + trace(arg3); + trace(arg4); + trace(arg5); +} diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d new file mode 100644 index 000000000000..3689823362ec --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d @@ -0,0 +1,87 @@ +#!/usr/sbin/dtrace -Cs + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +/* + * prefetch_ios: Number of IOs the prefetcher issued + * @pf["prefetched_demand_reads"]: Number of demand reads already prefetched + * @pf["sync_wait_for_async"]: Number of times sync IO waited for prefetch IO + * @pf["demand"]: Number of non-prefetch read IOs + * @pf["logical"]: Logical (uncompressed) bytes read per interval + * @pf["physical"]: Physical (compressed) bytes read per interval + */ + +#pragma D option aggsortkey +#pragma D option quiet + +#define SPA_MINBLOCKSHIFT 9 +#define ARC_FLAGS_PREFETCH (1 << 3) +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + +BEGIN +{ + prefetch_ios = `arc_stats.arcstat_prefetch_data_misses.value.ui64; + prefetch_ios += `arc_stats.arcstat_prefetch_metadata_misses.value.ui64; + @pf["demand"] = sum(0); + @pf["logical"] = sum(0); + @pf["physical"] = sum(0); + @pf["prefetched_demand_reads"] = count(); + @pf["sync_wait_for_async"] = count(); + clear(@pf); +} + +arc_read:arc-demand-hit-predictive-prefetch +{ + @pf["prefetched_demand_reads"] = count(); +} + +arc_read:arc-sync-wait-for-async +{ + @pf["sync_wait_for_async"] = count(); +} + +arc_read_done:entry +/ args[0]->io_spa->spa_name == $$1 / +{ + this->zio = args[0]; + this->buf = (arc_buf_t *)this->zio->io_private; + this->hdr = this->buf->b_hdr; + @pf["demand"] = sum(this->hdr->b_flags & ARC_FLAGS_PREFETCH ? 
0 : 1); + @pf["logical"] = sum(HDR_GET_LSIZE(this->hdr)); + @pf["physical"] = sum(HDR_GET_PSIZE(this->hdr)); +} + +tick-$2s +{ + this->new_prefetch_ios = + `arc_stats.arcstat_prefetch_data_misses.value.ui64 + + `arc_stats.arcstat_prefetch_metadata_misses.value.ui64; + printf("%u\n%-24s\t%u\n", `time, "prefetch_ios", + this->new_prefetch_ios - prefetch_ios); + printa("%-24s\t%@u\n", @pf); + prefetch_ios = this->new_prefetch_ios; + clear(@pf); +} + +ERROR +{ + trace(arg1); + trace(arg2); + trace(arg3); + trace(arg4); + trace(arg5); +} diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 3c3cbdf4c180..a7b4d05f7711 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -120,9 +120,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. + * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. 
When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. 
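+ *
+ * As an illustration only (assuming the standard /etc/system tunable
+ * syntax, which is not part of the ARC logic itself), compressed arc can
+ * be disabled at boot with:
+ *
+ *	set zfs:zfs_compressed_arc_enabled = 0
+ *
+ * after which every b_pdata holds uncompressed data and L2ARC reads must be
+ * transformed to match the on-disk block before their checksums can be
+ * verified, as described above.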
+ */ + #include #include +#include #include +#include #include #include #include @@ -151,10 +276,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - uint_t arc_reduce_dnlc_percent = 3; /* @@ -230,9 +351,10 @@ uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +boolean_t zfs_compressed_arc_enabled = B_TRUE; + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -273,7 +395,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. @@ -337,6 +459,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). 
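+ *
+ * For example, this value (together with the compressed and uncompressed
+ * sizes described above) can be inspected at run time via the arcstats
+ * kstats:
+ *
+ *	kstat -p zfs:0:arcstats:overhead_size
+ *	kstat -p zfs:0:arcstats:compressed_size
+ *	kstat -p zfs:0:arcstats:uncompressed_size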
+ */ + kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually @@ -482,20 +624,13 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; @@ -537,6 +672,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -570,20 +708,13 @@ static arc_stats_t arc_stats = { { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, @@ -656,8 +787,12 @@ static arc_state_t *arc_l2c_only; #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; @@ -717,6 +852,7 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing @@ -727,9 +863,10 @@ typedef struct l1arc_buf_hdr { #endif arc_buf_t *b_buf; - uint32_t b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; @@ -742,8 +879,7 @@ typedef struct l1arc_buf_hdr { 
refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; @@ -752,9 +888,6 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -763,20 +896,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. - */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. 
*/ l2arc_buf_hdr_t b_l2hdr; @@ -784,9 +934,6 @@ struct arc_buf_hdr { l1arc_buf_hdr_t b_l1hdr; }; -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -795,25 +942,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -866,16 +1023,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). 
- */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -917,12 +1064,10 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { @@ -934,7 +1079,7 @@ typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -942,21 +1087,22 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -974,14 +1120,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -1003,7 +1149,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -1041,13 +1187,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next 
= buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -1075,12 +1221,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1166,7 +1312,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1180,7 +1326,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1253,166 +1399,138 @@ buf_init(void) } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. - */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); +#define ARC_MINTIME (hz>>4) /* 62 ms */ -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); +} - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, 
HDR_GET_PSIZE(hdr)); + + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; + + void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } - return (equal); + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. 
+ */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - NULL, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -1451,7 +1569,7 @@ arc_buf_watch(arc_buf_t *buf) procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = buf->b_hdr->b_size; + ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); @@ -1462,11 +1580,14 @@ arc_buf_watch(arc_buf_t *buf) static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1488,29 +1609,29 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_thawed != NULL) - kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); - buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); + if (hdr->b_l1hdr.b_thawed != NULL) + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1518,95 +1639,282 @@ arc_buf_thaw(arc_buf_t *buf) void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL 
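/*
 * A minimal standalone sketch of the pad-to-psize step described in
 * arc_cksum_is_equal() above; pad_to_psize() is a hypothetical helper,
 * not a ZFS interface. When the freshly recompressed result comes up
 * short of the psize recorded in the header, the tail is zeroed so the
 * buffer matches, byte for byte, what was originally written and
 * checksummed in the main pool.
 */
#include <string.h>

static size_t
pad_to_psize(void *cbuf, size_t csize, size_t psize)
{
	if (csize < psize) {
		/* zero the slack so the checksum covers identical bytes */
		memset((char *)cbuf + csize, 0, psize - csize);
		csize = psize;
	}
	return (csize);
}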
|| + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); - arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { - /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. + * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} - multilist_remove(list, hdr); +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. 
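/*
 * A simplified sketch of why the compression setting gets its own helper,
 * per the comment above arc_hdr_set_compress(). The EX_COMPRESS_* macros
 * below are hypothetical stand-ins for the header's real bitfield macros:
 * the compression value is multi-bit, so the old value must be cleared and
 * the new one stored in a single locked update rather than as two
 * independent flag operations that another thread could observe half-done.
 */
#include <stdint.h>

#define	EX_COMPRESS_SHIFT	24
#define	EX_COMPRESS_MASK	(0x7fu << EX_COMPRESS_SHIFT)

static void
ex_set_compress(uint32_t *flags, uint32_t cmp)
{
	/* caller holds the lock protecting *flags */
	*flags = (*flags & ~EX_COMPRESS_MASK) |
	    ((cmp << EX_COMPRESS_SHIFT) & EX_COMPRESS_MASK);
}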
+ */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); } - /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); } -static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) { - int cnt; - arc_state_t *state = hdr->b_l1hdr.b_state; + uint64_t size; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); - multilist_insert(list, hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); } - return (cnt); } /* - * Move the supplied buffer to the indicated state. The hash lock - * for the buffer must be held by the caller. + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. 
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { - arc_state_t *old_state; - int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + + arc_state_t *state = hdr->b_l1hdr.b_state; + + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); + } + /* remove the prefetch flag if we get a reference */ + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + } +} + +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ +static int +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && + (state != arc_anon)) { + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The hash lock + * for the buffer must be held by the caller. 
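/*
 * A toy model (illustration only, the ex_ prefix is hypothetical) of the
 * accounting done by arc_evictable_space_increment() and its decrement
 * counterpart above: the hdr's b_pdata contributes its physical size, and
 * every arc_buf_t that is not sharing that data contributes one
 * uncompressed copy. A shared buffer adds nothing extra because its memory
 * is the hdr's b_pdata. Assumes shared_bufs <= nbufs.
 */
#include <stdint.h>

static uint64_t
ex_evictable_size(uint64_t psize, uint64_t lsize,
    unsigned nbufs, unsigned shared_bufs)
{
	return (psize + (uint64_t)(nbufs - shared_bufs) * lsize);
}
/* e.g. a 128K block cached compressed at 32K with two unshared bufs
 * accounts 32K + 2 * 128K of evictable space. */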
+ */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) +{ + arc_state_t *old_state; + int64_t refcnt; + uint32_t bufcnt; + boolean_t update_old, update_new; + arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -1618,20 +1926,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1639,25 +1947,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. - */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; /* * An L1 header always exists here, since if we're @@ -1668,38 +1968,38 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
*/ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1708,34 +2008,53 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. */ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3P(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1744,9 +2063,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3P(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -1824,39 +2163,85 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr, subsequent buffers will + * use arc_buf_clone(). 
+ */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC write) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing it's + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. + */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} + +/* + * Used when allocating additional buffers. 
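/*
 * The sharing test from arc_buf_alloc_impl() above, restated as a tiny
 * hypothetical predicate (illustration only): the first arc_buf_t may
 * point directly at the hdr's b_pdata only when no byteswap is pending,
 * the cached data is uncompressed, and the hdr is not currently being
 * written out to the L2ARC.
 */
#include <stdbool.h>

static bool
ex_can_share_pdata(bool byteswap_pending, bool compressed, bool l2_writing)
{
	return (!byteswap_pending && !compressed && !l2_writing);
}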
+ */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -1873,7 +2258,7 @@ arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -1887,12 +2272,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -1901,244 +2286,406 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - atomic_add_64(&arc_loaned_bytes, hdr->b_size); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) +static void +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_type = type; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); +} - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); + } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); +} + +/* + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. 
+ */ +static void +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); - } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. + */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) +static void +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. */ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); + refcount_transfer_ownership(&state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; + + /* + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. + */ + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); +} + +/* + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + */ +static void +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) +{ + arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); + + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ + if (buf->b_data != NULL) { + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
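/*
 * A small model (the ex_* names are hypothetical) of the kstat bookkeeping
 * in arc_share_buf() and arc_unshare_buf() above: once a buf shares the
 * hdr's b_pdata, the block counts toward the compressed and uncompressed
 * totals and one uncompressed copy leaves the overhead total; unsharing
 * reverses the same three deltas.
 */
#include <stdint.h>

struct ex_arc_totals {
	int64_t compressed;
	int64_t uncompressed;
	int64_t overhead;
};

static void
ex_account_share(struct ex_arc_totals *t, int64_t hdr_size, int64_t lsize,
    int sign)
{
	/* sign = +1 when sharing starts, -1 when it ends */
	t->compressed += sign * hdr_size;
	t->uncompressed += sign * lsize;
	t->overhead -= sign * lsize;
}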
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + arc_cksum_verify(buf); + arc_buf_unwatch(buf); + + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); + } + buf->b_data = NULL; + + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; + } + + /* only remove the buf if requested */ + if (!remove) return; + + /* remove the buf from the hdr list */ + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. + */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) { - l2arc_data_free_t *df; + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. 
The l2arc will free the data once it's finished + * writing it to the l2arc device. + */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); } /* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. */ -static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { - arc_buf_hdr_t *hdr = buf->b_hdr; + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. 
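/*
 * A sketch of the defer-free rule described for arc_hdr_free_pdata()
 * above, under simplified assumptions: a plain malloc'd buffer and a
 * hypothetical ex_defer_free() queue standing in for the
 * l2arc_free_on_write list. Data still being written to the L2ARC is
 * handed to the writer to free once its I/O completes; otherwise it is
 * freed on the spot.
 */
#include <stdbool.h>
#include <stdlib.h>

void ex_defer_free(void *data, size_t size);	/* hypothetical queue */

static void
ex_free_pdata(void *data, size_t size, bool l2_writing)
{
	if (l2_writing)
		ex_defer_free(data, size);
	else
		free(data);
}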
+ */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); - } -} + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); -static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * The header has been reallocated so we need to re-insert it into any + * lists it was on. */ - if (!HDR_HAS_L1HDR(hdr)) - return; + (void) buf_hash_insert(nhdr, NULL); - /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. - */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); /* - * The header does not have compression enabled. This can be due - * to the buffer not being compressible, or because we're - * freeing the buffer before the second phase of - * l2arc_write_buffer() has started (which does the compression - * step). In either case, b_tmp_cdata does not point to a - * separately compressed buffer, so there's nothing to free (it - * points to the same buffer as the arc_buf_t's b_data field). + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; - } + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. 
*/ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size, zio_data_buf_free); + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - hdr->b_l1hdr.b_tmp_cdata = NULL; + return (nhdr); } /* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. */ -static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) { - arc_buf_t **bufp; - - /* free up data associated with the buf */ - if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); - - arc_cksum_verify(buf); - arc_buf_unwatch(buf); - - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); - } - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; - - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } - - (void) refcount_remove_many(&state->arcs_size, size, buf); - buf->b_data = NULL; - - /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. - */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; - } - - /* only remove the buf if requested */ - if (!remove) - return; - - /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; - buf->b_next = NULL; - - ASSERT(buf->b_efunc == NULL); - - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); } static void @@ -2146,50 +2693,20 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); - /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() - */ - arc_buf_l2_cdata_free(hdr); - - /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. 
- * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). - */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); - - /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. Thus, we must be careful - * not to decrement them for this header either. - */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2197,13 +2714,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2227,40 +2747,22 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } - } #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); + } } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -2274,133 +2776,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - 
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. - */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } int32_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. 
- */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2427,11 +2831,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. it's b_tmp_cdata field) during it's write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. @@ -2442,11 +2846,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -2459,6 +2865,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2478,7 +2885,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2486,37 +2892,39 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. + * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). 
+ */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -2760,12 +3168,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might @@ -2777,7 +3185,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -2801,8 +3209,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -3065,36 +3473,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ @@ -3114,9 +3499,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3281,7 +3663,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. 
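/*
 * The clamped eviction request from arc_adjust_impl() above, restated as a
 * standalone helper (illustrative, the ex_ prefix is hypothetical): never
 * ask arc_evict_state() for more than the state currently holds as
 * evictable for that buffer type.
 */
#include <stdint.h>

static int64_t
ex_evict_delta(int64_t wanted, int64_t evictable)
{
	if (wanted <= 0 || evictable <= 0)
		return (0);
	return (evictable < wanted ? evictable : wanted);
}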
*/ static boolean_t @@ -3369,6 +3751,20 @@ arc_reclaim_thread(void) int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3437,57 +3833,12 @@ arc_reclaim_thread(void) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } -static void -arc_user_evicts_thread(void) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_user_evicts_cv, - &arc_user_evicts_lock, ddi_get_lbolt() + hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - thread_exit(); -} - /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -3571,18 +3922,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. 
*/ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -3622,12 +3972,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -3635,11 +3986,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -3652,9 +4001,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -3664,6 +4014,37 @@ arc_get_data_buf(arc_buf_t *buf) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } + return (datap); +} + +/* + * Free the arc data buffer. 
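arc_get_data_buf() now hands back a raw data block and charges it to a caller-supplied tag instead of to an arc_buf_t. A simplified, self-contained model of its accounting follows; the toy_hdr_t fields stand in for multilist_link_active() and b_refcnt and are purely illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

typedef struct toy_hdr {
	bool	on_list;	/* models multilist_link_active() */
	int	refcnt;		/* models b_refcnt */
} toy_hdr_t;

static int64_t total_size;	/* models state->arcs_size */
static int64_t evictable;	/* models state->arcs_esize[type] */

/* simplified model of the accounting done by the new arc_get_data_buf() */
static void *
toy_get_data_buf(toy_hdr_t *hdr, uint64_t size)
{
	void *datap = malloc(size);	/* zio_buf_alloc()/zio_data_buf_alloc() */

	total_size += size;
	if (hdr->on_list) {
		/* only unreferenced headers sit on an eviction list */
		assert(hdr->refcnt == 0);
		evictable += size;	/* the new block is evictable too */
	}
	return (datap);
}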
+ */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -3707,7 +4088,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; @@ -3741,7 +4122,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -3810,8 +4191,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -3820,7 +4201,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -3828,18 +4209,30 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } } +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -3859,31 +4252,32 @@ arc_read_done(zio_t *zio) arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = 
DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot_byteswap[bswap].ob_func; - func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); - arc_buf_watch(buf); + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -3897,31 +4291,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. 
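The callback loop above decompresses the physical block once and clones from the result for every further caller. A compact standalone model of that policy (toy types, with a placeholder decompressor) is shown below:

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

typedef struct toy_acb {
	struct toy_acb	*next;
	int		has_done;	/* prefetches carry no done callback */
	void		*buf;
} toy_acb_t;

/* placeholder for arc_decompress(): expand pdata (psize) into lsize bytes */
static void *
toy_decompress(const void *pdata, size_t psize, size_t lsize)
{
	void *out = malloc(lsize);
	/* a real implementation would run the block's compression algorithm */
	memcpy(out, pdata, psize < lsize ? psize : lsize);
	return (out);
}

/*
 * Model of the new callback loop: decompress once, for the first caller
 * that actually wants the data, then hand plain copies to every additional
 * caller (the equivalent of arc_buf_clone()).
 */
static void
toy_read_done(toy_acb_t *list, const void *pdata, size_t psize, size_t lsize)
{
	void *first = NULL;

	for (toy_acb_t *acb = list; acb != NULL; acb = acb->next) {
		if (!acb->has_done)
			continue;
		if (first == NULL) {
			first = toy_decompress(pdata, psize, lsize);
			acb->buf = first;
		} else {
			acb->buf = malloc(lsize);
			memcpy(acb->buf, first, lsize);
		}
	}
}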
+ */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -3991,7 +4404,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4008,8 +4420,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4041,7 +4453,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4062,10 +4475,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } @@ -4088,34 +4500,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4125,20 +4539,21 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4148,26 +4563,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4178,54 +4576,60 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). 
*/ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + + /* + * If compression is enabled on the hdr, then will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; } + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -4234,6 +4638,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4241,19 +4650,15 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. 
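On a miss, the header now always receives a b_pdata block to read into, and a compressed header turns the read into a raw I/O so the on-disk bytes are cached verbatim. A standalone sketch of those decisions, using made-up flag names, follows:

#include <stdlib.h>
#include <stdint.h>

#define	TOY_ZIO_RAW		0x01	/* models ZIO_FLAG_RAW */
#define	TOY_ARC_PREFETCH	0x02	/* models ARC_FLAG_PREFETCH */

typedef struct toy_mhdr {
	void		*pdata;		/* models b_pdata */
	uint64_t	psize;		/* physical (possibly compressed) size */
	int		compress;	/* 0 models ZIO_COMPRESS_OFF */
	int		flags;
} toy_mhdr_t;

/*
 * Illustrative miss-path setup: allocate the physical-data buffer, mark the
 * read raw when the header tracks compressed bytes, and record caller hints
 * (such as prefetch) as header flags.
 */
static int
toy_setup_miss(toy_mhdr_t *hdr, int zio_flags, int arc_flags)
{
	if (hdr->pdata == NULL)
		hdr->pdata = malloc(hdr->psize);	/* arc_hdr_alloc_pdata() */

	if (hdr->compress != 0)
		zio_flags |= TOY_ZIO_RAW;	/* keep the block compressed */

	if (arc_flags & TOY_ARC_PREFETCH)
		hdr->flags |= TOY_ARC_PREFETCH;	/* arc_hdr_set_flags() */

	return (zio_flags);
}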
*/ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4274,15 +4679,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + lsize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* @@ -4291,26 +4694,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ - if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, buf->b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + size, hdr->b_l1hdr.b_pdata, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4340,8 +4736,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -4352,20 +4748,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, return (0); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. 
*/ @@ -4381,85 +4763,38 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); - } else { + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. + */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); mutex_exit(hash_lock); - } - -} - -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. 
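The guard that replaces the old eviction-callback handling in arc_freed() can be restated as a one-line predicate; the sketch below is illustrative only, with plain booleans standing in for the header flag tests:

#include <stdbool.h>

/*
 * A freed block's header is destroyed only when nothing can still be using
 * it: no read in flight and no outstanding reference (a dedup-ed dmu_sync
 * write can legitimately hold one, as explained in the comment above).
 */
static bool
toy_can_destroy(bool has_l1hdr, bool io_in_progress, int refcnt)
{
	return (!has_l1hdr || (!io_in_progress && refcnt == 0));
}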
- */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); + mutex_exit(hash_lock); } - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); } /* @@ -4493,16 +4828,19 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4543,72 +4881,111 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? */ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); + arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. 
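The list walk introduced in arc_release() both unlinks the released buffer and remembers the last surviving buffer, which is the candidate for taking over a shared b_pdata block as the comment below explains. A minimal standalone model of that walk (toy types only):

#include <stddef.h>
#include <assert.h>

typedef struct toy_buf {
	struct toy_buf *next;
} toy_buf_t;

/*
 * Unlink 'buf' from the header's buffer list while tracking the last
 * remaining buffer, mirroring the bufp/lastbuf loop above.
 */
static toy_buf_t *
toy_unlink_buf(toy_buf_t **headp, toy_buf_t *buf)
{
	toy_buf_t *lastbuf = NULL;
	toy_buf_t **bufp = headp;

	while (*bufp != NULL) {
		if (*bufp == buf)
			*bufp = buf->next;	/* unlink the released buffer */
		if (*bufp != NULL) {
			lastbuf = *bufp;	/* track the tail as we go */
			bufp = &(*bufp)->next;
		}
	}
	buf->next = NULL;
	assert(lastbuf != NULL && lastbuf != buf);
	return (lastbuf);
}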
+ */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); - uint64_t *size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); - } - hdr->b_l1hdr.b_datacnt -= 1; + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). + */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_datacnt = 1; - nhdr->b_l1hdr.b_state = arc_anon; - nhdr->b_l1hdr.b_arc_access = 0; - nhdr->b_l1hdr.b_tmp_cdata = NULL; - nhdr->b_freeze_cksum = NULL; - + nhdr->b_l1hdr.b_bufcnt = 1; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); - (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -4622,8 +4999,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; } int @@ -4657,28 +5032,83 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. - * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback as invoked. 
*/ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); + arc_buf_unwatch(buf); + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + enum zio_compress compress; + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -4709,9 +5139,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -4719,7 +5151,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -4728,7 +5160,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). 
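arc_write_ready() either copies the compressed zio payload into a private b_pdata or, for uncompressed blocks, shares the caller's buffer outright. A toy model of that branch (hypothetical struct and parameter names, not the patch's code):

#include <string.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct toy_whdr {
	void	*pdata;		/* models b_pdata */
	void	*shared_buf;	/* non-NULL when sharing with an arc_buf_t */
} toy_whdr_t;

/*
 * If the block compressed on disk, keep a private copy of the compressed
 * payload; otherwise share the caller's uncompressed buffer instead of
 * copying it.
 */
static void
toy_write_ready(toy_whdr_t *hdr, void *io_data, uint64_t psize,
    void *buf_data, int compressed)
{
	if (compressed) {
		hdr->pdata = malloc(psize);		/* arc_hdr_alloc_pdata() */
		memcpy(hdr->pdata, io_data, psize);	/* keep compressed copy */
	} else {
		hdr->shared_buf = buf_data;		/* arc_share_buf() */
		hdr->pdata = buf_data;
	}
}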
*/ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -4762,19 +5194,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -4784,9 +5216,8 @@ arc_write_done(zio_t *zio) } zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, arc_done_func_t *ready, +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) @@ -4795,16 +5226,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -4813,7 +5242,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? 
arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -4908,12 +5360,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); @@ -4925,8 +5379,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -4962,39 +5418,150 @@ arc_kstat_update(kstat_t *ksp, int rw) return (0); } -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the ARC eviction - * code is laid out; arc_evict_state() assumes ARC buffers are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -arc_state_multilist_index_func(multilist_t *ml, void *obj) +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. + */ + ASSERT(!HDR_EMPTY(hdr)); + + /* + * The assumption here, is the hash value for a given + * arc_buf_hdr_t will remain constant throughout it's lifetime + * (i.e. it's b_spa, b_dva, and b_birth fields don't change). + * Thus, we don't need to store the header's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
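Before building the write zio, arc_write() (in the hunk above) discards the header's now-stale physical block, either by severing the sharing link or by freeing the private copy. The sketch below restates that cleanup, repeating the toy structure from the previous sketch so it stands alone; it is illustrative, not the patch's code:

#include <stdlib.h>
#include <stddef.h>

typedef struct toy_whdr {
	void	*pdata;		/* models b_pdata */
	void	*shared_buf;	/* non-NULL when sharing with an arc_buf_t */
} toy_whdr_t;

/* drop the stale physical block; a fresh one is made in write-ready */
static void
toy_write_prepare(toy_whdr_t *hdr, void *buf_data)
{
	if (hdr->pdata != NULL) {
		if (hdr->shared_buf == buf_data)
			hdr->shared_buf = NULL;	/* arc_unshare_buf(): buf keeps it */
		else
			free(hdr->pdata);	/* arc_hdr_free_pdata() */
		hdr->pdata = NULL;
	}
}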
+ */ + return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + multilist_get_num_sublists(ml)); +} + +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + 
refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) { - arc_buf_hdr_t *hdr = obj; - - /* - * We rely on b_dva to generate evenly distributed index - * numbers using buf_hash below. So, as an added precaution, - * let's make sure we never add empty buffers to the arc lists. - */ - ASSERT(!BUF_EMPTY(hdr)); - - /* - * The assumption here, is the hash value for a given - * arc_buf_hdr_t will remain constant throughout it's lifetime - * (i.e. it's b_spa, b_dva, and b_birth fields don't change). - * Thus, we don't need to store the header's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. 
- */ - return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % - multilist_get_num_sublists(ml)); + return (arc_c_max); } void @@ -5013,9 +5580,6 @@ arc_init(void) cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -5049,6 +5613,7 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; @@ -5099,68 +5664,10 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - - refcount_create(&arc_anon->arcs_size); - refcount_create(&arc_mru->arcs_size); - refcount_create(&arc_mru_ghost->arcs_size); - refcount_create(&arc_mfu->arcs_size); - refcount_create(&arc_mfu_ghost->arcs_size); - refcount_create(&arc_l2c_only->arcs_size); - + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; - arc_eviction_list = NULL; - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + arc_reclaim_thread_exit = B_FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ 
-5174,10 +5681,7 @@ arc_init(void) (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5200,10 +5704,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5211,22 +5715,10 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. - */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); - - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5237,27 +5729,7 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -5546,9 +6018,13 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -5571,13 +6047,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ 
-5635,32 +6111,26 @@ l2arc_write_done(zio_t *zio) */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -5687,43 +6157,36 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -5736,7 +6199,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -5749,9 +6212,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -5901,12 +6365,11 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -5927,36 +6390,22 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * the delta by which the device hand has changed due to alignment). */ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, - buf_compress_minsz; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); - const boolean_t do_headroom_boost = *headroom_boost; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. */ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; - - /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. 
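Because the L2ARC now stores the same physical bytes as the main pool, l2arc_read_done() can validate a device read against the block pointer's checksum and simply fall back to the main pool otherwise. That decision reduces to a small predicate, sketched here with illustrative parameters:

#include <stdbool.h>

/*
 * Deliver the L2ARC read to arc_read_done() only when the bytes match the
 * bp's checksum, the I/O succeeded, and the header was not evicted while the
 * read was in flight; any other outcome triggers a re-read from the pool.
 */
static bool
toy_l2arc_read_ok(bool cksum_matches_bp, int io_error, bool hdr_evicted)
{
	return (cksum_matches_bp && io_error == 0 && !hdr_evicted);
}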
@@ -5977,13 +6426,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -5998,7 +6445,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. @@ -6012,15 +6459,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -6044,63 +6483,76 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ZIO_FLAG_CANFAIL); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing). + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t size = arc_hdr_size(hdr); + + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). - * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. 
+ * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. */ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + void *to_write; + if (!HDR_SHARED_DATA(hdr)) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(size); + } - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + l2arc_free_data_on_write(to_write, size, type); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, size, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); + write_sz += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + write_asize += size; /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * Keep the clock hand suitably device-aligned. */ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + size); + write_psize += asize; + dev->l2ad_hand += asize; mutex_exit(hash_lock); - write_sz += buf_sz; - write_asize += buf_a_sz; + (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -6117,104 +6569,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (0); } - mutex_enter(&dev->l2ad_mtx); - - /* - * Note that elsewhere in this file arcstat_l2_asize - * and the used space on l2ad_vdev are updated using b_asize, - * which is not necessarily rounded up to the device block size. - * Too keep accounting consistent we do the same here as well: - * stats_size accumulates the sum of b_asize of the written buffers, - * while write_asize accumulates the sum of b_asize rounded up - * to the device block size. - * The latter sum is used only to validate the corectness of the code. - */ - uint64_t stats_size = 0; - write_asize = 0; - - /* - * Now start writing the buffers. We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. - */ - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - if ((HDR_L2COMPRESS(hdr)) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(hdr)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. 
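The single-pass loop above keeps three running totals: write_sz counts logical bytes, write_asize counts the bytes actually handed to zio_write_phys(), and write_psize counts device space (and is what advances l2ad_hand). A standalone sketch of that arithmetic, assuming arc_hdr_size() returns the physical (possibly compressed) size of b_pdata and that vdev_psize_to_asize() on a plain (non-raidz) vdev is effectively a round-up to 1 << ashift; the sizes used are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Round-up stand-in for vdev_psize_to_asize() on a non-raidz vdev. */
static uint64_t
psize_to_asize(uint64_t psize, int ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t lsize = 128 * 1024;	/* HDR_GET_LSIZE(hdr) */
	uint64_t psize = 17 * 1024;	/* arc_hdr_size(hdr): compressed b_pdata */
	int ashift = 12;		/* vdev with 4K allocation size */

	uint64_t write_sz = lsize;		/* feeds arcstat_l2_size */
	uint64_t write_asize = psize;		/* bytes given to zio_write_phys() */
	uint64_t write_psize = psize_to_asize(psize, ashift); /* l2ad_hand advance */

	/* Prints: write_sz=131072 write_asize=17408 write_psize=20480 */
	printf("write_sz=%llu write_asize=%llu write_psize=%llu\n",
	    (unsigned long long)write_sz, (unsigned long long)write_asize,
	    (unsigned long long)write_psize);
	return (0);
}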
- */ - *headroom_boost = B_TRUE; - } - } - - /* - * Pick up the buffer data we had previously stashed away - * (and now potentially also compressed). - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - buf_sz = hdr->b_l2hdr.b_asize; - - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); - - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - uint64_t buf_a_sz; - - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - (void) zio_nowait(wzio); - - stats_size += buf_sz; - - /* - * Keep the clock hand suitably device-aligned. - */ - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_asize += buf_a_sz; - dev->l2ad_hand += buf_a_sz; - } - } - - mutex_exit(&dev->l2ad_mtx); - ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, stats_size); - vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. @@ -6232,182 +6592,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (write_asize); } -/* - * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The original l2hdr contents were left - * untouched and are ready for writing to an L2 device. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). 
- */ -static boolean_t -l2arc_compress_buf(arc_buf_hdr_t *hdr) -{ - void *cdata; - size_t csize, len, rounded; - ASSERT(HDR_HAS_L2HDR(hdr)); - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - ASSERT3P(cdata, !=, NULL); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, - cdata, l2hdr->b_asize); - - rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } else if (csize > 0 && csize < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression failed, release the compressed buffer. - * l2hdr will be left unmodified. - */ - zio_data_buf_free(cdata, len); - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. - * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. - */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - uint64_t csize; - void *cdata; - - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. 
*/ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. - * This buffer serves as a temporary holder of compressed data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. - */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - enum zio_compress comp = hdr->b_l2hdr.b_compress; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (comp == ZIO_COMPRESS_OFF) { - /* - * In this case, b_tmp_cdata points to the same buffer - * as the arc_buf_t's b_data field. We don't want to - * free it, since the arc_buf_t will handle that. - */ - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else if (comp == ZIO_COMPRESS_EMPTY) { - /* - * In this case, b_tmp_cdata was compressed to an empty - * buffer, thus there's nothing to free and b_tmp_cdata - * should have been set to NULL in l2arc_write_buffers(). - */ - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - /* - * If the data was compressed, then we've allocated a - * temporary buffer for it, so now we need to release it. - */ - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - -} - /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -6420,7 +6604,6 @@ l2arc_feed_thread(void) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -6458,7 +6641,7 @@ l2arc_feed_thread(void) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -6491,7 +6674,7 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. @@ -6584,7 +6767,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 4f469fc7500c..7540168f7bcf 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -45,6 +45,9 @@ #include #include #include +#include + +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing @@ -52,7 +55,6 @@ */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -64,9 +66,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs are added to the dbuf cache once the last hold is released. 
If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. */ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. If the cache size continues to grow and hits the high + * water mark, then callers adding elments to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. 
+ */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) @@ -76,6 +145,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); @@ -88,6 +158,7 @@ dbuf_dest(void *vdb, void *unused) dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -117,8 +188,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -129,7 +198,7 @@ dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; @@ -180,7 +249,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; @@ -212,7 +281,7 @@ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; @@ -237,8 +306,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db) atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t dbuf_do_evict; - typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING @@ -323,15 +390,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } -void -dbuf_evict(dmu_buf_impl_t *db) +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); + dmu_buf_impl_t *db = obj; + + /* + * The assumption here, is the hash value for a given + * dmu_buf_impl_t will remain constant throughout it's lifetime + * (i.e. it's objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
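With the defaults above (dbuf_cache_max_bytes of 100 MB, dbuf_cache_max_shift of 5, and 10% bands), the eviction thread's operating range works out to 90 MB-110 MB once dbuf_init() has applied the 1/32-of-ARC cap. A standalone sketch of that arithmetic; the ARC size is an assumed example value:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t arc_max = 4ULL << 30;			/* assume a 4 GB ARC */
	uint64_t max_bytes = 100ULL * 1024 * 1024;	/* dbuf_cache_max_bytes */
	int max_shift = 5;				/* dbuf_cache_max_shift */
	uint64_t hiwater_pct = 10, lowater_pct = 10;

	/* dbuf_init() caps the dbuf cache at 1/32 of the ARC. */
	if ((arc_max >> max_shift) < max_bytes)
		max_bytes = arc_max >> max_shift;

	uint64_t hiwater = max_bytes + (max_bytes * hiwater_pct) / 100;
	uint64_t lowater = max_bytes - (max_bytes * lowater_pct) / 100;

	/*
	 * With a 4 GB ARC the cap does not bite: max stays 100 MB, the
	 * eviction thread is signalled above 100 MB, callers evict
	 * directly above 110 MB, and eviction stops below 90 MB.  A
	 * 1 GB ARC would instead cap max_bytes at 32 MB (1 GB >> 5).
	 */
	printf("max=%lluMB hiwater=%lluMB lowater=%lluMB\n",
	    (unsigned long long)(max_bytes >> 20),
	    (unsigned long long)(hiwater >> 20),
	    (unsigned long long)(lowater >> 20));
	return (0);
}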
+ */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + dmu_buf_impl_t *db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached it's maximum size, dbufs are removed + * and destroyed. The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. + */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } - dbuf_clear(db); - dbuf_destroy(db); + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the callers context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. 
This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; + + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } void @@ -359,18 +592,38 @@ dbuf_init(void) goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
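The thread-specific-data check in dbuf_evict_notify() is, at bottom, a per-thread re-entrancy guard. A minimal standalone rendering of the same shape, using a C11 thread-local flag in place of tsd_get()/tsd_set(); the names and cache-size parameters here are illustrative only:

#include <stdbool.h>
#include <stdint.h>

static _Thread_local bool thread_is_evicting;	/* stands in for zfs_dbuf_evict_key */

static void
evict_one(void)
{
	thread_is_evicting = true;
	/* ... lock a random sublist and destroy its tail dbuf ... */
	thread_is_evicting = false;
}

static void
evict_notify(uint64_t cache_size, uint64_t max_bytes, uint64_t hiwater_bytes)
{
	/*
	 * A thread already inside dbuf_evict_one() can wind up back here
	 * via dbuf_destroy()->dbuf_rele(); the thread-local flag keeps it
	 * from recursing into eviction a second time.
	 */
	if (thread_is_evicting)
		return;

	if (cache_size > max_bytes) {
		/* wake the eviction thread (cv_signal() in the patch) ... */
		if (cache_size > max_bytes + hiwater_bytes)
			evict_one();	/* above the high water mark: evict inline */
	}
}

int
main(void)
{
	/* 111 MB cache, 100 MB max, 10 MB band: evicts one dbuf inline. */
	evict_notify(111ULL << 20, 100ULL << 20, 10ULL << 20);
	return (0);
}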
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -382,8 +635,23 @@ dbuf_fini(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -541,7 +809,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -556,8 +824,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -568,6 +834,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -579,6 +846,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -647,7 +915,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -696,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -733,8 +1001,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -851,7 +1117,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -907,9 +1173,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1033,7 +1300,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -1138,7 +1405,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1149,7 +1416,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1547,7 +1814,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1556,12 +1823,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); return (B_TRUE); } @@ -1725,7 +1988,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1743,10 +2006,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1758,59 +2021,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. 
For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. - */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1821,15 +2087,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* * If this dbuf is referenced from an indirect dbuf, @@ -1922,7 +2198,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -1971,7 +2247,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -1996,76 +2272,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. 
- */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2103,10 +2315,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2135,7 +2374,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2229,6 +2469,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2309,18 +2550,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, return (SET_ERROR(ENOENT)); } - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } + if (db->db_buf != NULL) ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); - } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); @@ -2338,13 +2569,19 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - 
arc_buf_alloc(dn->dn_objset->os_spa, + arc_alloc_buf(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(refcount_is_zero(&db->db_holds)); + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -2418,7 +2655,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); + ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2489,8 +2726,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2535,55 +2774,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -2867,7 +3095,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3133,10 +3361,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3152,8 +3377,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3330,8 +3553,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, + &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 35015825b4e1..a8c0dc5ad654 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1318,7 +1318,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1663,8 +1663,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2035,10 +2034,10 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/usr/src/uts/common/fs/zfs/dmu_diff.c b/usr/src/uts/common/fs/zfs/dmu_diff.c index 7665d1ca591d..982b96132cc8 100644 --- a/usr/src/uts/common/fs/zfs/dmu_diff.c +++ b/usr/src/uts/common/fs/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -146,7 +146,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 78ab9624d35e..efad81dd2cdd 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -1122,7 +1118,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 8d4e07c6c623..c3f52d33e7e5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -608,7 +608,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -620,7 +620,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + 
arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -644,7 +644,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; @@ -674,7 +674,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index 2822ca45250d..5075818de490 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -379,7 +379,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -594,7 +594,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 024c64897b74..4126da22794a 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -509,7 +509,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 7179c41cbfe3..daf539ec5cbe 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); - dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); @@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 21a5787e42c7..0a382ee1d9db 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -651,7 +651,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -677,7 +677,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -709,7 +709,7 @@ dsl_scan_recurse(dsl_scan_t *scn, 
dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (0); diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index df0f25684958..1e6229877bde 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -227,4 +227,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} #endif /* ZFS_DEBUG */ diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index f886a2ae3a75..b1e9456f5a0d 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -43,51 +43,83 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. 
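The new HDR_SET_*/HDR_GET_* macros near the top of this header store sizes in 512-byte (SPA_MINBLOCKSIZE) units, which is why the setters assert SPA_MINBLOCKSHIFT alignment. A standalone illustration of the encoding using a stand-in struct; the field widths are an assumption for the sketch, not the real arc_buf_hdr_t layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SPA_MINBLOCKSHIFT	9	/* SPA_MINBLOCKSIZE == 512 */

/* Stand-in for the size fields carried by arc_buf_hdr_t. */
typedef struct {
	uint16_t b_psize;	/* physical size, in 512-byte units */
	uint16_t b_lsize;	/* logical size, in 512-byte units */
} hdr_sizes_t;

#define HDR_SET_PSIZE(hdr, x) do { \
	assert(((x) & ((1U << SPA_MINBLOCKSHIFT) - 1)) == 0); \
	(hdr)->b_psize = (uint16_t)((x) >> SPA_MINBLOCKSHIFT); \
} while (0)
#define HDR_GET_PSIZE(hdr)	((uint64_t)(hdr)->b_psize << SPA_MINBLOCKSHIFT)

int
main(void)
{
	hdr_sizes_t hdr;

	HDR_SET_PSIZE(&hdr, 8192);	/* an 8K physical block */
	/* Prints: stored 16, recovered 8192 */
	printf("stored %u, recovered %llu\n", hdr.b_psize,
	    (unsigned long long)HDR_GET_PSIZE(&hdr));
	return (0);
}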
*/ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { @@ -95,11 +127,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -119,19 +150,17 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -boolean_t arc_clear_callback(arc_buf_t *buf); - void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 496412614b16..6862599a6540 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. 
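
Returning to the ARC_FLAG_COMPRESS_0 .. ARC_FLAG_COMPRESS_6 placeholders above: the header's compression algorithm is packed into those seven flag bits rather than kept in a separate field, and the accessors that do the packing live in arc.c, outside this header hunk. A rough, self-contained sketch of how such a get/set pair behaves, using an illustrative offset name and bitfield helpers simplified from spa.h's BF32_GET/BF32_SET (not the patch's actual accessors):

#include <assert.h>
#include <stdint.h>

#define	SPA_COMPRESSBITS	7
#define	HDR_COMPRESS_OFFSET	24	/* bit of ARC_FLAG_COMPRESS_0; macro name is illustrative */

/* Simplified bitfield helpers in the style of spa.h's BF32_GET/BF32_SET. */
#define	BF32_GET(x, low, len)	(((x) >> (low)) & ((1U << (len)) - 1))
#define	BF32_SET(x, low, len, val) \
	((x) = ((x) & ~(((1U << (len)) - 1) << (low))) | ((uint32_t)(val) << (low)))

int
main(void)
{
	uint32_t b_flags = 0;
	enum { ZIO_COMPRESS_LZ4 = 15 };	/* stand-in value for this sketch */

	/* Record the compression mode in the reserved flag bits. */
	BF32_SET(b_flags, HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, ZIO_COMPRESS_LZ4);
	assert(BF32_GET(b_flags, HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS) == ZIO_COMPRESS_LZ4);

	/* The ordinary flag bits below ARC_FLAG_COMPRESS_0 are untouched. */
	assert((b_flags & ((1U << HDR_COMPRESS_OFFSET) - 1)) == 0);
	return (0);
}
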
*/ @@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index 27c39135e056..d13a87ea86e2 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -71,6 +71,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); void refcount_init(void); void refcount_fini(void); @@ -98,6 +99,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) #define refcount_init() #define refcount_fini() diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index d8840bf86db8..da63812831fd 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -133,6 +133,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. 
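
SPA_COMPRESSBITS names the width of the on-disk compression field; the next hunk rewrites BP_GET_COMPRESS/BP_SET_COMPRESS in terms of it, and it is presumably the same 7-bit width the ARC_FLAG_COMPRESS_* placeholders reserve in the header flags. Concretely, the compression function occupies a 7-bit field starting at bit 32 of blk_prop. A small standalone sketch of that extraction (BF64_GET below is simplified from the helper in spa.h):

#include <assert.h>
#include <stdint.h>

#define	SPA_COMPRESSBITS	7

/* Simplified version of spa.h's BF64_GET bitfield helper. */
#define	BF64_GET(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))

int
main(void)
{
	uint64_t blk_prop = (uint64_t)15 << 32;	/* only the compression field set */

	/* The same extraction BP_GET_COMPRESS performs on bp->blk_prop. */
	assert(BF64_GET(blk_prop, 32, SPA_COMPRESSBITS) == 15);
	return (0);
}
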
@@ -375,8 +377,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 6d8f7601f350..8061c81e0b45 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -522,6 +522,10 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index 572b29d3cb1f..2f7579fd7334 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -97,8 +97,12 @@ extern zio_checksum_t zio_checksum_edonr_byteswap; extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 590f1dfff87e..72a24580ff97 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -244,7 +245,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -281,7 +282,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 1acc8b2e6aa6..30798b6541f7 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -266,7 +266,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -284,7 +284,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -952,8 +952,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + void *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -2263,7 +2263,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index 8bd7e02bef38..469af54477df 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -293,20 +293,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; - spa_t *spa = zio->io_spa; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); @@ -315,6 +307,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -354,35 +347,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) spa->spa_cksum_tmpls[checksum], &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); } /*
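
The net effect of this refactor is that checksum verification no longer needs a zio_t: a caller holding a raw buffer and the block pointer it was read from (the l2arc read path described in the commit message, for example) can verify it directly via zio_checksum_error_impl() or the new zio_checksum_equal() wrapper. A hedged sketch of such a caller; the helper name and placement are hypothetical, and the real verification code lives in arc.c, outside these hunks:

/*
 * Illustrative only: verify a raw buffer against the checksum recorded
 * in its block pointer, without constructing a zio_t.
 */
static boolean_t
buf_matches_bp(spa_t *spa, blkptr_t *bp, void *data, uint64_t size)
{
	enum zio_checksum checksum = BP_GET_CHECKSUM(bp);

	/*
	 * A zero offset is acceptable here; the offset argument is only
	 * consulted for embedded (ZIL-style) checksum verifiers.
	 */
	return (zio_checksum_error_impl(spa, bp, checksum, data, size,
	    0, NULL) == 0);
}
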