From 21306c0025877721b53d7eb8b3f9131a80491ed4 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Thu, 23 Jul 2015 08:13:45 -0500 Subject: [PATCH 01/38] Refresh dkio.h and add dkioc_free_util.h Update dkio.h from Nexenta's version to pick up DKIOCFREE and add their dkioc_free_util.h header for TRIM support. Requires-builders: none --- lib/libspl/include/sys/Makefile.am | 1 + lib/libspl/include/sys/dkio.h | 84 ++++++++++++++++++++++-- lib/libspl/include/sys/dkioc_free_util.h | 33 ++++++++++ 3 files changed, 113 insertions(+), 5 deletions(-) create mode 100644 lib/libspl/include/sys/dkioc_free_util.h diff --git a/lib/libspl/include/sys/Makefile.am b/lib/libspl/include/sys/Makefile.am index e7af317e0c6c..e0c6e64a3b83 100644 --- a/lib/libspl/include/sys/Makefile.am +++ b/lib/libspl/include/sys/Makefile.am @@ -11,6 +11,7 @@ libspl_HEADERS = \ $(top_srcdir)/lib/libspl/include/sys/cred.h \ $(top_srcdir)/lib/libspl/include/sys/debug.h \ $(top_srcdir)/lib/libspl/include/sys/dkio.h \ + $(top_srcdir)/lib/libspl/include/sys/dkioc_free_util.h \ $(top_srcdir)/lib/libspl/include/sys/dklabel.h \ $(top_srcdir)/lib/libspl/include/sys/errno.h \ $(top_srcdir)/lib/libspl/include/sys/feature_tests.h \ diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h index 2e6b9a1a9d41..33312deab0e8 100644 --- a/lib/libspl/include/sys/dkio.h +++ b/lib/libspl/include/sys/dkio.h @@ -18,17 +18,19 @@ * * CDDL HEADER END */ + /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ #ifndef _SYS_DKIO_H #define _SYS_DKIO_H - - #include /* Needed for NDKMAP define */ +#include /* Needed for UINT16_MAX */ #ifdef __cplusplus extern "C" { @@ -83,9 +85,10 @@ struct dk_cinfo { #define DKC_MD 16 /* meta-disk (virtual-disk) driver */ #define DKC_INTEL82077 19 /* 82077 floppy disk controller */ #define DKC_DIRECT 20 /* Intel direct attached device i.e. IDE */ -#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type (Obsolete) */ #define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ #define DKC_VBD 23 /* virtual block device */ +#define DKC_BLKDEV 24 /* generic block device (see blkdev(7d)) */ /* * Sun reserves up through 1023 @@ -166,6 +169,9 @@ struct dk_geom { #define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ #define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ +#define DKIOCGEXTVTOC (DKIOC|23) /* Get extended VTOC */ +#define DKIOCSEXTVTOC (DKIOC|24) /* Set extended VTOC, Write to Disk */ + /* * Disk Cache Controls. These ioctls should be supported by * all disk drivers. @@ -228,6 +234,14 @@ struct dk_callback { */ #define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ +#if defined(__i386) || defined(__amd64) +/* ioctl to write extended partition structure into the disk */ +#define DKIOCSETEXTPART (DKIOC|46) +#endif + +/* ioctl to report whether the disk is solid state or not - used for ZFS */ +#define DKIOCSOLIDSTATE (DKIOC|38) + /* * Ioctl to force driver to re-read the alternate partition and rebuild * the internal defect map. @@ -252,6 +266,9 @@ struct defect_header { }; #define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ +#define DKIOCEXTPARTINFO (DKIOC|19) /* Get extended partition or slice */ + /* parameters */ + /* * Used by applications to get partition or slice information @@ -268,6 +285,11 @@ struct part_info { int p_length; }; +struct extpart_info { + diskaddr_t p_start; + diskaddr_t p_length; +}; + /* The following ioctls are for Optical Memory Device */ #define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ #define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ @@ -290,6 +312,16 @@ enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; */ #define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ +/* + * ioctl to get the media info including physical block size + */ +#define DKIOCGMEDIAINFOEXT (DKIOC|48) + +/* + * ioctl to determine whether media is write-protected + */ +#define DKIOCREADONLY (DKIOC|49) + /* * Used for providing the temperature. */ @@ -313,6 +345,17 @@ struct dk_minfo { diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */ }; +/* + * Used for Media info or the current profile info + * including physical block size if supported. + */ +struct dk_minfo_ext { + uint_t dki_media_type; /* Media type or profile info */ + uint_t dki_lbsize; /* Logical blocksize of media */ + diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */ + uint_t dki_pbsize; /* Physical blocksize of media */ +}; + /* * Media types or profiles known */ @@ -358,6 +401,9 @@ struct dk_minfo { #define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ #define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ +#define DKIOCDUMPINIT (DKIOC | 28) /* Dumpify a zvol */ +#define DKIOCDUMPFINI (DKIOC | 29) /* Un-Dumpify a zvol */ + typedef uint_t volcapinfo_t; typedef uint_t volcapset_t; @@ -476,6 +522,34 @@ typedef struct dk_updatefw_32 { #define FW_TYPE_TEMP 0x0 /* temporary use */ #define FW_TYPE_PERM 0x1 /* permanent use */ +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) + +#define DF_WAIT_SYNC 0x00000001 /* Wait for full write-out of free. */ +typedef struct dkioc_free_list_ext_s { + uint64_t dfle_start; + uint64_t dfle_length; +} dkioc_free_list_ext_t; + +typedef struct dkioc_free_list_s { + uint64_t dfl_flags; + uint64_t dfl_num_exts; + int64_t dfl_offset; + + /* + * N.B. this is only an internal debugging API! This is only called + * from debug builds of sd for pre-release checking. Remove before GA! + */ + void (*dfl_ck_func)(uint64_t, uint64_t, void *); + void *dfl_ck_arg; + + dkioc_free_list_ext_t dfl_exts[1]; +} dkioc_free_list_t; +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) #ifdef __cplusplus } diff --git a/lib/libspl/include/sys/dkioc_free_util.h b/lib/libspl/include/sys/dkioc_free_util.h new file mode 100644 index 000000000000..b4d7da4cf7af --- /dev/null +++ b/lib/libspl/include/sys/dkioc_free_util.h @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Nexenta Inc. All rights reserved. + */ + +#ifndef _SYS_DKIOC_FREE_UTIL_H +#define _SYS_DKIOC_FREE_UTIL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void dfl_free(dkioc_free_list_t *dfl) { + vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKIOC_FREE_UTIL_H */ From aec754265238b93066e6c5758e14fe4d5e906c85 Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Mon, 20 Apr 2015 15:32:08 +0200 Subject: [PATCH 02/38] 6363 Add UNMAP/TRIM functionality to ZFS Ported by: Tim Chase Porting notes: The trim kstats are in zfs/ along with the other per-pool stats. The kstats can be cleared by writing to the kstat file. Null format parameters to strftime() were replaced with "%c". Added vdev trace support. New dfl_alloc() function in the SPL is used to allocate arrays of dkioc_free_list_t objects since they may be large enough to require virtual memory. Other changes: Suppressed kstat creation for pools with "$" names. The changes to vdev_raidz_map_alloc() have been minimized in order to allow more conflict-free merging with future changes (ABD). Added the following module parameters: zfs_trim - Enable TRIM zfs_trim_min_ext_sz - Minimum size to trim zfs_txgs_per_trim - Transaction groups over which to batch trims Requires-builders: none --- cmd/zpool/zpool_main.c | 183 +++++- configure.ac | 1 + include/libzfs.h | 1 + include/sys/dmu.h | 4 +- include/sys/fs/zfs.h | 17 +- include/sys/metaslab.h | 7 + include/sys/metaslab_impl.h | 12 + include/sys/range_tree.h | 4 + include/sys/spa.h | 57 +- include/sys/spa_impl.h | 33 +- include/sys/sysevent/eventdefs.h | 2 + include/sys/trace_vdev.h | 77 +++ include/sys/vdev.h | 12 + include/sys/vdev_impl.h | 22 + include/sys/zfs_context.h | 14 +- include/sys/zio.h | 21 + include/sys/zio_impl.h | 6 + include/sys/zio_priority.h | 10 + lib/libspl/include/sys/dkio.h | 7 +- lib/libspl/include/sys/dkioc_free_util.h | 7 +- lib/libzfs/libzfs_pool.c | 22 + lib/libzfs/libzfs_util.c | 1 + man/man8/zpool.8 | 3 + module/zcommon/zpool_prop.c | 8 +- module/zfs/dsl_scan.c | 4 + module/zfs/dsl_synctask.c | 1 - module/zfs/metaslab.c | 586 +++++++++++++++++- module/zfs/range_tree.c | 25 + module/zfs/spa.c | 354 +++++++++++ module/zfs/spa_config.c | 15 +- module/zfs/spa_misc.c | 242 +++++++- module/zfs/trace.c | 1 + module/zfs/vdev.c | 214 +++++++ module/zfs/vdev_disk.c | 76 ++- module/zfs/vdev_file.c | 87 ++- module/zfs/vdev_label.c | 8 +- module/zfs/vdev_mirror.c | 69 +-- module/zfs/vdev_missing.c | 52 +- module/zfs/vdev_queue.c | 42 +- module/zfs/vdev_raidz.c | 184 +++++- module/zfs/vdev_root.c | 27 +- module/zfs/zfs_ioctl.c | 32 + module/zfs/zio.c | 237 ++++++- module/zfs/zvol.c | 2 +- tests/runfiles/linux.run | 3 + tests/zfs-tests/tests/functional/Makefile.am | 1 + .../cli_root/zpool_get/zpool_get.cfg | 2 + .../tests/functional/trim/Makefile.am | 8 + .../functional/trim/autotrim_001_pos.ksh | 114 ++++ .../tests/functional/trim/cleanup.ksh | 31 + .../functional/trim/manualtrim_001_pos.ksh | 100 +++ .../zfs-tests/tests/functional/trim/setup.ksh | 36 ++ .../zfs-tests/tests/functional/trim/trim.cfg | 60 ++ .../tests/functional/trim/trim.kshlib | 35 ++ 54 files changed, 2979 insertions(+), 200 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/trim/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/trim/trim.cfg create mode 100644 tests/zfs-tests/tests/functional/trim/trim.kshlib diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b96aaf3b7dcf..128fbc1773ee 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. @@ -100,6 +100,7 @@ static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); +static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); @@ -154,6 +155,7 @@ typedef enum { HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, + HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, @@ -284,6 +286,8 @@ static zpool_command_t command_table[] = { { "scrub", zpool_do_scrub, HELP_SCRUB }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { NULL }, + { "trim", zpool_do_trim, HELP_TRIM }, + { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, @@ -370,6 +374,8 @@ get_usage(zpool_help_t idx) return (gettext("\tscrub [-s | -p] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); + case HELP_TRIM: + return (gettext("\ttrim [-s|-r ] ...\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPsvxD] [-T d|u] [pool] ... \n" @@ -6561,6 +6567,31 @@ scrub_callback(zpool_handle_t *zhp, void *data) return (err != 0); } +typedef struct trim_cbdata { + boolean_t cb_start; + uint64_t cb_rate; +} trim_cbdata_t; + +int +trim_callback(zpool_handle_t *zhp, void *data) +{ + trim_cbdata_t *cb = data; + int err; + + /* + * Ignore faulted pools. + */ + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot trim '%s': pool is " + "currently unavailable\n"), zpool_get_name(zhp)); + return (1); + } + + err = zpool_trim(zhp, cb->cb_start, cb->cb_rate); + + return (err != 0); +} + /* * zpool scrub [-s | -p] ... * @@ -6649,6 +6680,47 @@ zpool_do_resilver(int argc, char **argv) return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } +/* + * zpool trim [-s|-r ] ... + * + * -s Stop. Stops any in-progress trim. + * -r Sets the TRIM rate. + */ +int +zpool_do_trim(int argc, char **argv) +{ + int c; + trim_cbdata_t cb; + + cb.cb_start = B_TRUE; + cb.cb_rate = 0; + + /* check options */ + while ((c = getopt(argc, argv, "sr:")) != -1) { + switch (c) { + case 's': + cb.cb_start = B_FALSE; + break; + case 'r': + if (zfs_nicestrtonum(NULL, optarg, &cb.cb_rate) == -1) { + (void) fprintf(stderr, + gettext("invalid value for rate\n")); + usage(B_FALSE); + } + break; + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, trim_callback, &cb)); +} /* * Print out detailed scrub status. @@ -6972,6 +7044,59 @@ print_checkpoint_status(pool_checkpoint_stat_t *pcs) space_buf); } +static void +print_trim_status(uint64_t trim_prog, uint64_t total_size, uint64_t rate, + uint64_t start_time_u64, uint64_t end_time_u64) +{ + time_t start_time = start_time_u64, end_time = end_time_u64; + char *buf; + + assert(trim_prog <= total_size); + if (trim_prog != 0 && trim_prog != total_size) { + buf = ctime(&start_time); + buf[strlen(buf) - 1] = '\0'; /* strip trailing newline */ + if (rate != 0) { + char rate_str[32]; + zfs_nicenum(rate, rate_str, sizeof (rate_str)); + (void) printf(" trim: %.02f%%\tstarted: %s\t" + "(rate: %s/s)\n", (((double)trim_prog) / + total_size) * 100, buf, rate_str); + } else { + (void) printf(" trim: %.02f%%\tstarted: %s\t" + "(rate: max)\n", (((double)trim_prog) / + total_size) * 100, buf); + } + } else { + if (start_time != 0) { + /* + * Non-zero start time means we were run at some point + * in the past. + */ + if (end_time != 0) { + /* Non-zero end time means we completed */ + time_t diff = end_time - start_time; + int hrs, mins; + + buf = ctime(&end_time); + buf[strlen(buf) - 1] = '\0'; + hrs = diff / 3600; + mins = (diff % 3600) / 60; + (void) printf(gettext(" trim: completed on %s " + "(after %dh%dm)\n"), buf, hrs, mins); + } else { + buf = ctime(&start_time); + buf[strlen(buf) - 1] = '\0'; + /* Zero end time means we were interrupted */ + (void) printf(gettext(" trim: interrupted\t" + "(started %s)\n"), buf); + } + } else { + /* trim was never run */ + (void) printf(gettext(" trim: none requested\n")); + } + } +} + static void print_error_log(zpool_handle_t *zhp) { @@ -7083,6 +7208,43 @@ print_dedup_stats(nvlist_t *config) zpool_dump_ddt(dds, ddh); } +/* + * Calculates the total space available on log devices on the pool. + * For whatever reason, this is not counted in the root vdev's space stats. + */ +static uint64_t +zpool_slog_space(nvlist_t *nvroot) +{ + nvlist_t **newchild; + uint_t c, children; + uint64_t space = 0; + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &newchild, &children) == 0); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE; + vdev_stat_t *vs; + uint_t n; + uint_t n_subchildren = 1; + nvlist_t **subchild; + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, + &islog); + if (!islog) + continue; + verify(nvlist_lookup_uint64_array(newchild[c], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &n) == 0); + + /* vdev can be non-leaf, so multiply by number of children */ + (void) nvlist_lookup_nvlist_array(newchild[c], + ZPOOL_CONFIG_CHILDREN, &subchild, &n_subchildren); + space += n_subchildren * vs->vs_space; + } + + return (space); +} + /* * Display a summary of pool status. Displays a summary such as: * @@ -7400,6 +7562,7 @@ status_callback(zpool_handle_t *zhp, void *data) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; + uint64_t trim_prog, trim_rate, trim_start_time, trim_stop_time; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); @@ -7418,6 +7581,24 @@ status_callback(zpool_handle_t *zhp, void *data) if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; + /* Grab trim stats if the pool supports it */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_TRIM_PROG, + &trim_prog) == 0 && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TRIM_RATE, + &trim_rate) == 0 && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TRIM_START_TIME, + &trim_start_time) == 0 && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TRIM_STOP_TIME, + &trim_stop_time) == 0) { + /* + * For whatever reason, root vdev_stats_t don't + * include log devices. + */ + print_trim_status(trim_prog, vs->vs_space + + zpool_slog_space(nvroot), trim_rate, + trim_start_time, trim_stop_time); + } + (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", diff --git a/configure.ac b/configure.ac index 7a84c249a117..e773d35e89df 100644 --- a/configure.ac +++ b/configure.ac @@ -327,6 +327,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/alloc_class/Makefile tests/zfs-tests/tests/functional/threadsappend/Makefile tests/zfs-tests/tests/functional/tmpfile/Makefile + tests/zfs-tests/tests/functional/trim/Makefile tests/zfs-tests/tests/functional/truncate/Makefile tests/zfs-tests/tests/functional/user_namespace/Makefile tests/zfs-tests/tests/functional/userquota/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index 85b0bc0ddb77..78b3bd3cbe09 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -258,6 +258,7 @@ typedef struct splitflags { extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); +extern int zpool_trim(zpool_handle_t *, boolean_t start, uint64_t rate); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen_one(zpool_handle_t *, void *); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 63c51ecfb3a5..440da32cb9f8 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -383,6 +383,8 @@ typedef struct dmu_buf { #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_TRIM_START_TIME "trim_start_time" +#define DMU_POOL_TRIM_STOP_TIME "trim_stop_time" /* * Allocate an object from this objset. The range of object numbers diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 945853739b7c..4a4ed441ac5f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017 Datto Inc. @@ -243,6 +243,8 @@ typedef enum { ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, + ZPOOL_PROP_FORCETRIM, + ZPOOL_PROP_AUTOTRIM, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -733,6 +735,10 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" +#define ZPOOL_CONFIG_TRIM_PROG "trim_prog" +#define ZPOOL_CONFIG_TRIM_RATE "trim_rate" +#define ZPOOL_CONFIG_TRIM_START_TIME "trim_start_time" +#define ZPOOL_CONFIG_TRIM_STOP_TIME "trim_stop_time" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" @@ -903,6 +909,14 @@ typedef struct pool_checkpoint_stat { uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; +/* + * TRIM command configuration info. + */ +typedef struct trim_cmd_info_s { + uint64_t tci_start; /* B_TRUE = start; B_FALSE = stop */ + uint64_t tci_rate; /* requested TRIM rate in bytes/sec */ +} trim_cmd_info_t; + /* * ZIO types. Needed to interpret vdev statistics below. */ @@ -1222,6 +1236,7 @@ typedef enum zfs_ioc { ZFS_IOC_EVENTS_NEXT, /* 0x5a81 */ ZFS_IOC_EVENTS_CLEAR, /* 0x5a82 */ ZFS_IOC_EVENTS_SEEK, /* 0x5a83 */ + ZFS_IOC_POOL_TRIM, /* 0x5a84 */ /* * FreeBSD - 1/64 numbers reserved. diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index f47bc19cfc2b..610720b8ebe8 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -56,6 +57,8 @@ void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); +void metaslab_auto_trim(metaslab_t *, uint64_t, boolean_t); +uint64_t metaslab_trim_mem_used(metaslab_t *); /* * metaslab alloc flags @@ -82,6 +85,7 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); +zio_t *metaslab_trim_all(metaslab_t *, uint64_t *, uint64_t *, boolean_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); @@ -118,6 +122,9 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, boolean_t); void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); +void metaslab_trimstats_create(spa_t *spa); +void metaslab_trimstats_destroy(spa_t *spa); + #ifdef __cplusplus } #endif diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 137a8476924a..56967e9a5ab0 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -278,6 +279,11 @@ struct metaslab_group { kcondvar_t mg_ms_initialize_cv; }; +typedef struct { + uint64_t ts_birth; /* TXG at which this trimset starts */ + range_tree_t *ts_tree; /* tree of extents in the trimset */ +} metaslab_trimset_t; + /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. @@ -352,6 +358,11 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; + metaslab_trimset_t *ms_cur_ts; /* currently prepared trims */ + metaslab_trimset_t *ms_prev_ts; /* previous (aging) trims */ + kcondvar_t ms_trim_cv; + metaslab_trimset_t *ms_trimming_ts; + /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty @@ -363,6 +374,7 @@ struct metaslab { range_tree_t *ms_checkpointing; /* to add to the checkpoint */ boolean_t ms_condensing; /* condensing? */ + kcondvar_t ms_condensing_cv; boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 7f79786f56dd..4d1aaba5d53f 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H @@ -92,6 +93,9 @@ boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); +boolean_t range_tree_contains_part(range_tree_t *rt, uint64_t start, + uint64_t size); +uint64_t range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/include/sys/spa.h b/include/sys/spa.h index febf0e8f241b..ddb19b1fadfc 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -738,6 +738,28 @@ typedef enum spa_import_type { SPA_IMPORT_ASSEMBLE } spa_import_type_t; +/* + * Should we force sending TRIM commands even to devices which evidently + * don't support it? + * OFF: no, only send to devices which indicated support + * ON: yes, force send to everybody + */ +typedef enum { + SPA_FORCE_TRIM_OFF = 0, /* default */ + SPA_FORCE_TRIM_ON +} spa_force_trim_t; + +/* + * Should we send TRIM commands in-line during normal pool operation while + * deleting stuff? + * OFF: no + * ON: yes + */ +typedef enum { + SPA_AUTO_TRIM_OFF = 0, /* default */ + SPA_AUTO_TRIM_ON +} spa_auto_trim_t; + /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, @@ -764,15 +786,16 @@ extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); -#define SPA_ASYNC_CONFIG_UPDATE 0x01 -#define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 -#define SPA_ASYNC_RESILVER_DONE 0x08 -#define SPA_ASYNC_RESILVER 0x10 -#define SPA_ASYNC_AUTOEXPAND 0x20 -#define SPA_ASYNC_REMOVE_DONE 0x40 -#define SPA_ASYNC_REMOVE_STOP 0x80 -#define SPA_ASYNC_INITIALIZE_RESTART 0x100 +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 +#define SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY 0x200 /* * Controls the behavior of spa_vdev_remove(). @@ -813,6 +836,13 @@ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); +/* trimming */ +extern void spa_man_trim(spa_t *spa, uint64_t rate); +extern void spa_man_trim_stop(spa_t *spa); +extern void spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate, + uint64_t *start_time, uint64_t *stop_time); +extern void spa_trim_stop_wait(spa_t *spa); + /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); @@ -1005,6 +1035,8 @@ extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); +extern spa_force_trim_t spa_get_force_trim(spa_t *spa); +extern spa_auto_trim_t spa_get_auto_trim(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); @@ -1119,6 +1151,11 @@ extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); +/* TRIM/UNMAP kstat update */ +extern void spa_trimstats_update(spa_t *spa, uint64_t extents, uint64_t bytes, + uint64_t extents_skipped, uint64_t bytes_skipped); +extern void spa_trimstats_auto_slow_incr(spa_t *spa); + #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 404aaa9ee373..62feb2e6f97a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. @@ -195,6 +195,8 @@ typedef enum spa_config_source { SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ } spa_config_source_t; +typedef struct spa_trimstats spa_trimstats_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -378,6 +380,31 @@ struct spa { uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ + + /* TRIM */ + uint64_t spa_force_trim; /* force sending trim? */ + uint64_t spa_auto_trim; /* see spa_auto_trim_t */ + + kmutex_t spa_auto_trim_lock; + kcondvar_t spa_auto_trim_done_cv; /* all autotrim thrd's exited */ + uint64_t spa_num_auto_trimming; /* # of autotrim threads */ + taskq_t *spa_auto_trim_taskq; + + kmutex_t spa_man_trim_lock; + uint64_t spa_man_trim_rate; /* rate of trim in bytes/sec */ + uint64_t spa_num_man_trimming; /* # of manual trim threads */ + boolean_t spa_man_trim_stop; /* requested manual trim stop */ + kcondvar_t spa_man_trim_update_cv; /* updates to TRIM settings */ + kcondvar_t spa_man_trim_done_cv; /* manual trim has completed */ + /* For details on trim start/stop times see spa_get_trim_prog. */ + uint64_t spa_man_trim_start_time; + uint64_t spa_man_trim_stop_time; + taskq_t *spa_man_trim_taskq; + + /* TRIM/UNMAP kstats */ + spa_trimstats_t *spa_trimstats; /* alloc'd by kstat_create */ + kstat_t *spa_trimstats_ks; + uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ spa_keystore_t spa_keystore; /* loaded crypto keys */ @@ -416,6 +443,10 @@ extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); +extern void spa_auto_trim_taskq_create(spa_t *spa); +extern void spa_man_trim_taskq_create(spa_t *spa); +extern void spa_auto_trim_taskq_destroy(spa_t *spa); +extern void spa_man_trim_taskq_destroy(spa_t *spa); #ifdef __cplusplus } diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index aa13bd5052c7..adc83861f6ec 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -118,6 +118,8 @@ extern "C" { #define ESC_ZFS_BOOTFS_VDEV_ATTACH "bootfs_vdev_attach" #define ESC_ZFS_POOL_REGUID "pool_reguid" #define ESC_ZFS_HISTORY_EVENT "history_event" +#define ESC_ZFS_TRIM_START "trim_start" +#define ESC_ZFS_TRIM_FINISH "trim_finish" /* * datalink subclass definitions. diff --git a/include/sys/trace_vdev.h b/include/sys/trace_vdev.h index d7af44c25397..98c4f6888eef 100644 --- a/include/sys/trace_vdev.h +++ b/include/sys/trace_vdev.h @@ -108,6 +108,83 @@ DEFINE_EVENT(zfs_removing_class_4, name, \ /* END CSTYLED */ DEFINE_REMOVE_FREE_EVENT_TXG(zfs_remove__free__inflight); +/* + * Support for tracepoints of the form: + * + * DTRACE_PROBE3(..., + * vdev_t *vd, ..., + * uint64_t mused, ..., + * uint64_t mlim, ..., + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_vdev_mused_mlim_class, + TP_PROTO(vdev_t *vd, uint64_t mused, uint64_t mlim), + TP_ARGS(vd, mused, mlim), + TP_STRUCT__entry( + __field(uint64_t, vdev_id) + __field(uint64_t, vdev_guid) + __field(uint64_t, mused) + __field(uint64_t, mlim) + ), + TP_fast_assign( + __entry->vdev_id = vd->vdev_id; + __entry->vdev_guid = vd->vdev_guid; + __entry->mused = mused; + __entry->mlim = mlim; + ), + TP_printk("vd { vdev_id %llu vdev_guid %llu }" + " mused = %llu mlim = %llu", + __entry->vdev_id, __entry->vdev_guid, + __entry->mused, __entry->mlim) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_VDEV_MUSED_MLIM_EVENT(name) \ +DEFINE_EVENT(zfs_vdev_mused_mlim_class, name, \ + TP_PROTO(vdev_t *vd, uint64_t mused, uint64_t mlim), \ + TP_ARGS(vd, mused, mlim)) +/* END CSTYLED */ +DEFINE_VDEV_MUSED_MLIM_EVENT(zfs_autotrim__mem__lim); + +/* + * Generic support for tracepoints of the form: + * + * DTRACE_PROBE1(..., + * metaslab_t *, ..., + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_msp_class, + TP_PROTO(metaslab_t *msp), + TP_ARGS(msp), + TP_STRUCT__entry( + __field(uint64_t, ms_id) + __field(uint64_t, ms_start) + __field(uint64_t, ms_size) + __field(uint64_t, ms_fragmentation) + ), + TP_fast_assign( + __entry->ms_id = msp->ms_id; + __entry->ms_start = msp->ms_start; + __entry->ms_size = msp->ms_size; + __entry->ms_fragmentation = msp->ms_fragmentation; + ), + TP_printk("msp { ms_id %llu ms_start %llu ms_size %llu " + "ms_fragmentation %llu }", + __entry->ms_id, __entry->ms_start, + __entry->ms_size, __entry->ms_fragmentation) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_MSP_EVENT(name) \ +DEFINE_EVENT(zfs_msp_class, name, \ + TP_PROTO(metaslab_t *msp), \ + TP_ARGS(msp)) +/* END CSTYLED */ +DEFINE_MSP_EVENT(zfs_preserve__spilled); +DEFINE_MSP_EVENT(zfs_drop__spilled); + #endif /* _TRACE_VDEV_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 2091892b27da..3e4307d6fdf3 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -46,6 +47,13 @@ typedef enum vdev_dtl_type { DTL_TYPES } vdev_dtl_type_t; +typedef struct vdev_trim_info { + vdev_t *vti_vdev; + uint64_t vti_txg; /* ignored for manual trim */ + void (*vti_done_cb)(void *); + void *vti_done_arg; +} vdev_trim_info_t; + extern int zfs_nocacheflush; extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); @@ -163,6 +171,10 @@ extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); +extern void vdev_man_trim(vdev_trim_info_t *vti); +extern void vdev_auto_trim(vdev_trim_info_t *vti); +extern void vdev_trim_stop_wait(vdev_t *vd); + /* * Label routines */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 8f8a8ccf608e..13d990649c51 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -77,6 +78,8 @@ typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); +typedef void vdev_trim_func_t(vdev_t *vd, zio_t *pio, + dkioc_free_list_t *trim_exts, boolean_t auto_trim); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); @@ -100,11 +103,14 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; + /* * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. * Used when initializing vdevs. Isn't used by leaf ops. */ vdev_xlation_func_t *vdev_op_xlate; + + vdev_trim_func_t *vdev_op_trim; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -313,6 +319,20 @@ struct vdev { range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; + boolean_t vdev_man_trimming; /* manual trim is ongoing */ + uint64_t vdev_trim_prog; /* trim progress in bytes */ + /* + * Because trim zios happen outside of the DMU transactional engine, + * we cannot rely on the DMU quiescing async trim zios to the vdev + * before doing pool reconfiguration tasks. Therefore we count them + * separately and quiesce them using vdev_trim_stop_wait before + * removing or changing vdevs. + */ + kmutex_t vdev_trim_zios_lock; + kcondvar_t vdev_trim_zios_cv; + uint64_t vdev_trim_zios; /* # of in-flight async trim zios */ + boolean_t vdev_trim_zios_stop; /* see zio_trim_should_bypass */ + /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). @@ -343,6 +363,7 @@ struct vdev { uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_notrim; /* true if Unmap/TRIM is unsupported */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ @@ -478,6 +499,7 @@ extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); +extern boolean_t vdev_is_dirty(vdev_t *vd, int flags, void *arg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 11a32bb3117a..cb2ba3b77296 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -578,6 +578,8 @@ typedef struct vsecattr { #define CRCREAT 0 +#define F_FREESP 11 + extern int fop_getattr(vnode_t *vp, vattr_t *vap); #define VOP_CLOSE(vp, f, c, o, cr, ct) vn_close(vp) @@ -586,6 +588,16 @@ extern int fop_getattr(vnode_t *vp, vattr_t *vap); #define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) +#if defined(HAVE_FILE_FALLOCATE) && \ + defined(FALLOC_FL_PUNCH_HOLE) && \ + defined(FALLOC_FL_KEEP_SIZE) +#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) \ + fallocate((vp)->v_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, \ + (flck)->l_start, (flck)->l_len) +#else +#define VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) (0) +#endif + #define VN_RELE(vp) vn_close(vp) extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, diff --git a/include/sys/zio.h b/include/sys/zio.h index 4b7ad3e227e3..f3989c2086bb 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -26,6 +26,7 @@ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ZIO_H @@ -38,6 +39,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -280,6 +283,9 @@ typedef void zio_done_func_t(zio_t *zio); extern int zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; +extern int zfs_trim; + +struct range_tree; /* * A bookmark is a four-tuple that uniquely @@ -334,6 +340,9 @@ struct zbookmark_phys { (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) +#define ZIO_IS_TRIM(zio) \ + ((zio)->io_type == ZIO_TYPE_IOCTL && (zio)->io_cmd == DKIOCFREE) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; @@ -465,6 +474,10 @@ struct zio { uint64_t io_size; uint64_t io_orig_size; + /* Used by trim zios */ + dkioc_free_list_t *io_dfl; + boolean_t io_dfl_free_on_destroy; + /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; @@ -549,6 +562,14 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); +extern zio_t *zio_trim_dfl(zio_t *pio, spa_t *spa, vdev_t *vd, + dkioc_free_list_t *dfl, boolean_t dfl_free_on_destroy, boolean_t auto_trim, + zio_done_func_t *done, void *private); + +extern zio_t *zio_trim_tree(zio_t *pio, spa_t *spa, vdev_t *vd, + struct range_tree *tree, boolean_t auto_trim, zio_done_func_t *done, + void *private, int dkiocfree_flags, metaslab_t *msp); + extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 344048c6a634..53ba94e2bc26 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ZIO_IMPL_H @@ -250,6 +251,11 @@ enum zio_stage { ZIO_STAGE_VDEV_IO_START | \ ZIO_STAGE_VDEV_IO_ASSESS) +#define ZIO_TRIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_VDEV_IO_STAGES) + #define ZIO_BLOCKING_STAGES \ (ZIO_STAGE_DVA_ALLOCATE | \ ZIO_STAGE_DVA_CLAIM | \ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index d8e6a1745969..971c8d940822 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -14,6 +14,7 @@ */ /* * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -30,6 +31,15 @@ typedef enum zio_priority { ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ + + /* + * Trims are separated into auto & manual trims. If a manual trim is + * initiated, auto trims are discarded late in the zio pipeline just + * prior to being issued. This lets manual trim start up much faster + * if a lot of auto trims have already been queued up. + */ + ZIO_PRIORITY_AUTO_TRIM, /* async auto trim operation */ + ZIO_PRIORITY_MAN_TRIM, /* manual trim operation */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ } zio_priority_t; diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h index 33312deab0e8..5b537dd959cc 100644 --- a/lib/libspl/include/sys/dkio.h +++ b/lib/libspl/include/sys/dkio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ @@ -541,7 +541,10 @@ typedef struct dkioc_free_list_s { /* * N.B. this is only an internal debugging API! This is only called - * from debug builds of sd for pre-release checking. Remove before GA! + * from debug builds of sd for integrity self-checking. The reason it + * isn't #ifdef DEBUG is because that breaks ABI compatibility when + * mixing DEBUG and non-DEBUG kernel modules and the cost of having + * a couple unused pointers is too low to justify that risk. */ void (*dfl_ck_func)(uint64_t, uint64_t, void *); void *dfl_ck_arg; diff --git a/lib/libspl/include/sys/dkioc_free_util.h b/lib/libspl/include/sys/dkioc_free_util.h index b4d7da4cf7af..902d5c0cef18 100644 --- a/lib/libspl/include/sys/dkioc_free_util.h +++ b/lib/libspl/include/sys/dkioc_free_util.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Nexenta Inc. All rights reserved. + * Copyright 2016 Nexenta Inc. All rights reserved. */ #ifndef _SYS_DKIOC_FREE_UTIL_H @@ -26,6 +26,11 @@ static inline void dfl_free(dkioc_free_list_t *dfl) { vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); } +static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) { + return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags)); +} + + #ifdef __cplusplus } #endif diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f799471e4351..06dd399c4a24 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2255,6 +2255,28 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) } } +/* + * Trim the pool. + */ +int +zpool_trim(zpool_handle_t *zhp, boolean_t start, uint64_t rate) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + trim_cmd_info_t tci = { .tci_start = start, .tci_rate = rate }; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = (uintptr_t)&tci; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_TRIM, &zc) == 0) + return (0); + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot trim %s"), zc.zc_name); + return (zpool_standard_error(hdl, errno, msg)); +} + /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index d7401cdf408a..d3a64331ab94 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -25,6 +25,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 8f35ca0ee7ce..6eabf49d9a97 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -195,6 +195,9 @@ .Cm sync .Oo Ar pool Oc Ns ... .Nm +.Cm trim +.Oo Fl pr Ar pool +.Nm .Cm upgrade .Nm .Cm upgrade diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 2d577793753e..acae01dac3fc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -20,8 +20,8 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -130,6 +130,12 @@ zpool_prop_init(void) zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table); + zprop_register_index(ZPOOL_PROP_FORCETRIM, "forcetrim", + SPA_FORCE_TRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "FORCETRIM", boolean_table); + zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", + SPA_AUTO_TRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "AUTOTRIM", boolean_table); /* hidden properties */ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 3180ce65abee..aaf2b47972c6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -24,6 +24,7 @@ * Copyright 2016 Gary Mills * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -1012,6 +1013,9 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { + /* Stop any ongoing TRIMs */ + spa_man_trim_stop(dp->dp_spa); + if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index b63ce5cad90c..bafe36aac8fe 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -91,7 +91,6 @@ dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); goto top; } - spa_close(spa, FTAG); return (dst.dst_error); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 20e7f0ed38e0..4dbb08ac7db0 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include @@ -36,6 +37,7 @@ #include #include #include +#include #define WITH_DF_BLOCK_ALLOCATOR @@ -230,6 +232,43 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; #endif +/* + * How many TXG's worth of updates should be aggregated per TRIM/UNMAP + * issued to the underlying vdev. We keep two range trees of extents + * (called "trim sets") to be trimmed per metaslab, the `current' and + * the `previous' TS. New free's are added to the current TS. Then, + * once `zfs_txgs_per_trim' transactions have elapsed, the `current' + * TS becomes the `previous' TS and a new, blank TS is created to be + * the new `current', which will then start accumulating any new frees. + * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's + * extents are trimmed, the TS is destroyed and the current TS again + * becomes the previous TS. + * This serves to fulfill two functions: aggregate many small frees + * into fewer larger trim operations (which should help with devices + * which do not take so kindly to them) and to allow for disaster + * recovery (extents won't get trimmed immediately, but instead only + * after passing this rather long timeout, thus preserving + * 'zfs import -F' functionality). + */ +unsigned int zfs_txgs_per_trim = 32; +/* + * Maximum number of bytes we'll put into a single zio_trim. This is for + * vdev queue processing purposes and also because some devices advertise + * they can handle a lot more LBAs per command than they can handle + * efficiently. + */ +uint64_t zfs_max_bytes_per_trim = 128 << 20; + +static void metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size); +static void metaslab_trim_add(void *arg, uint64_t offset, uint64_t size); + +static zio_t *metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim); + +static metaslab_trimset_t *metaslab_new_trimset(uint64_t txg, kmutex_t *lock); +static void metaslab_free_trimset(metaslab_trimset_t *ts); +static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, + uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit); + /* * ========================================================================== * Metaslab classes @@ -1171,19 +1210,20 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, + uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); - while (rs != NULL) { + for (; rs != NULL; rs = AVL_NEXT(t, rs)) { uint64_t offset = P2ROUNDUP(rs->rs_start, align); - if (offset + size <= rs->rs_end) { + if (offset + size <= rs->rs_end && + !metaslab_check_trim_conflict(msp, &offset, size, align, + rs->rs_end)) { *cursor = offset + size; return (offset); } - rs = AVL_NEXT(t, rs); } /* @@ -1194,7 +1234,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ @@ -1218,7 +1258,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_allocatable->rt_root; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { @@ -1271,7 +1311,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { @@ -1308,12 +1348,19 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + for (rs = avl_last(&msp->ms_allocatable_by_size); + rs != NULL && rs->rs_end - rs->rs_start >= size; + rs = AVL_PREV(&msp->allocatable_by_size)) { + *cursor = rs->rs_start; + *cursor_end = rs->rs_end; + if (!metaslab_check_trim_conflict(msp, cursor, size, + 1, *cursor_end)) { + /* segment appears to be acceptable */ + break; + } + } + if (rs == NULL || rs->rs_end - rs->rs_start < size) return (-1ULL); - - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; } offset = *cursor; @@ -1354,6 +1401,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_block_maxsize(msp); + /* mutable copy for adjustment by metaslab_check_trim_conflict */ + uint64_t adjustable_start; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1366,7 +1415,12 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + if (rs != NULL) + adjustable_start = rs->rs_start; + if (rs == NULL || rs->rs_end - adjustable_start < size || + metaslab_check_trim_conflict(msp, &adjustable_start, size, 1, + rs->rs_end)) { + /* segment not usable, try the largest remaining one */ t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; @@ -1376,13 +1430,17 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); + adjustable_start = rs->rs_start; + if (rs->rs_end - adjustable_start < size || + metaslab_check_trim_conflict(msp, &adjustable_start, + size, 1, rs->rs_end)) { + /* even largest remaining segment not usable */ + return (-1ULL); + } } - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); - } - return (-1ULL); + *cursor = adjustable_start + size; + return (*cursor); } static metaslab_ops_t metaslab_ndf_ops = { @@ -1457,6 +1515,8 @@ metaslab_load_impl(metaslab_t *msp) for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_defer[t], + metaslab_trim_remove, msp); } } msp->ms_max_size = metaslab_block_maxsize(msp); @@ -1523,6 +1583,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_trim_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1546,6 +1607,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ASSERT(ms->ms_sm != NULL); } + ms->ms_cur_ts = metaslab_new_trimset(0, &ms->ms_lock); + /* * We create the main range tree here, but we don't create the * other range trees until metaslab_sync_done(). This serves @@ -1598,6 +1661,12 @@ metaslab_fini(metaslab_t *msp) metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + /* Wait for trimming to finish */ + mutex_enter(&msp->ms_lock); + while (msp->ms_trimming_ts != NULL) + cv_wait(&msp->ms_trim_cv, &msp->ms_lock); + mutex_exit(&msp->ms_lock); + metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); @@ -1620,12 +1689,19 @@ metaslab_fini(metaslab_t *msp) for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } + + metaslab_free_trimset(msp->ms_cur_ts); + if (msp->ms_prev_ts) + metaslab_free_trimset(msp->ms_prev_ts); + ASSERT3P(msp->ms_trimming_ts, ==, NULL); + ASSERT0(msp->ms_deferspace); range_tree_destroy(msp->ms_checkpointing); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_trim_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2663,6 +2739,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ + if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON && + !vd->vdev_man_trimming) + range_tree_walk(*defer_tree, metaslab_trim_add, msp); range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { @@ -2996,6 +3075,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); + metaslab_trim_remove(msp, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); @@ -3277,8 +3357,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); - + ASSERT(!metaslab_should_allocate(msp, asize) || + msp->ms_trimming_ts != NULL); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -3830,6 +3910,9 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); + if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON && + !vd->vdev_man_trimming) + metaslab_trim_add(msp, offset, size); mutex_exit(&msp->ms_lock); } @@ -3949,6 +4032,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); + metaslab_trim_remove(msp, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) @@ -4256,8 +4340,20 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) + if (msp->ms_loaded) { + VERIFY(&msp->ms_lock == msp->ms_tree->rt_lock); range_tree_verify(msp->ms_allocatable, offset, size); +#ifdef DEBUG + VERIFY(&msp->ms_lock == msp->ms_cur_ts->ts_tree->rt_lock); + range_tree_verify(msp->ms_cur_ts->ts_tree, offset, size); + if (msp->ms_prev_ts != NULL) { + VERIFY(&msp->ms_lock == + msp->ms_prev_ts->ts_tree->rt_lock); + range_tree_verify(msp->ms_prev_ts->ts_tree, + offset, size); + } +#endif + } range_tree_verify(msp->ms_freeing, offset, size); range_tree_verify(msp->ms_checkpointing, offset, size); @@ -4290,6 +4386,448 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) spa_config_exit(spa, SCL_VDEV, FTAG); } +/* + * Trims all free space in the metaslab. Returns the root TRIM zio (that the + * caller should zio_wait() for) and the amount of space in the metaslab that + * has been scheduled for trimming in the `delta' return argument. + */ +zio_t * +metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, + boolean_t *was_loaded) +{ + uint64_t cur = *cursor, trimmed_space = 0; + zio_t *trim_io = NULL; + range_seg_t rsearch, *rs; + avl_index_t where; + const uint64_t max_bytes = zfs_max_bytes_per_trim; + + ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); + ASSERT3U(cur, >=, msp->ms_start); + ASSERT3U(cur, <=, msp->ms_start + msp->ms_size); + + mutex_enter(&msp->ms_lock); + + while (msp->ms_condensing) + cv_wait(&msp->ms_condensing_cv, &msp->ms_lock); + + while (msp->ms_loading) + metaslab_load_wait(msp); + /* + * On the initial call we memorize if we had to load the metaslab + * for ourselves, so we can unload it when we're done. + */ + if (cur == msp->ms_start) + *was_loaded = msp->ms_loaded; + if (!msp->ms_loaded) { + if (metaslab_load(msp) != 0) { + /* Load failed, stop trimming this metaslab */ + *cursor = msp->ms_start + msp->ms_size; + mutex_exit(&msp->ms_lock); + return (NULL); + } + } + + /* + * Flush out any scheduled extents and add everything in ms_tree + * from the last cursor position, but not more than the trim run + * limit. + */ + range_tree_vacate(msp->ms_cur_ts->ts_tree, NULL, NULL); + + rsearch.rs_start = cur; + rsearch.rs_end = cur + SPA_MINBLOCKSIZE; + rs = avl_find(&msp->ms_tree->rt_root, &rsearch, &where); + if (rs == NULL) { + rs = avl_nearest(&msp->ms_tree->rt_root, where, AVL_AFTER); + if (rs != NULL) + cur = rs->rs_start; + } + + /* Clear out ms_prev_ts, since we'll be trimming everything. */ + if (msp->ms_prev_ts != NULL) { + metaslab_free_trimset(msp->ms_prev_ts); + msp->ms_prev_ts = NULL; + } + + while (rs != NULL && trimmed_space < max_bytes) { + uint64_t end; + if (cur < rs->rs_start) + cur = rs->rs_start; + end = MIN(cur + (max_bytes - trimmed_space), rs->rs_end); + metaslab_trim_add(msp, cur, end - cur); + trimmed_space += (end - cur); + cur = end; + if (cur == rs->rs_end) + rs = AVL_NEXT(&msp->ms_tree->rt_root, rs); + } + + if (trimmed_space != 0) { + /* Force this trim to take place ASAP. */ + msp->ms_prev_ts = msp->ms_cur_ts; + msp->ms_cur_ts = metaslab_new_trimset(0, &msp->ms_lock); + trim_io = metaslab_exec_trim(msp, B_FALSE); + ASSERT(trim_io != NULL); + + /* + * Not at the end of this metaslab yet, have vdev_man_trim + * come back around for another run. + */ + *cursor = cur; + } else { + *cursor = msp->ms_start + msp->ms_size; + if (!(*was_loaded) && !vdev_is_dirty(msp->ms_group->mg_vd, + VDD_METASLAB, msp) && msp->ms_activation_weight == 0) + metaslab_unload(msp); + } + + mutex_exit(&msp->ms_lock); + *delta = trimmed_space; + + return (trim_io); +} + +/* + * Notifies the trimsets in a metaslab that an extent has been allocated. + * This removes the segment from the queues of extents awaiting to be trimmed. + */ +static void +metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size) +{ + metaslab_t *msp = arg; + + range_tree_clear(msp->ms_cur_ts->ts_tree, offset, size); + if (msp->ms_prev_ts != NULL) + range_tree_clear(msp->ms_prev_ts->ts_tree, offset, size); +} + +/* + * Notifies the trimsets in a metaslab that an extent has been freed. + * This adds the segment to the currently open queue of extents awaiting + * to be trimmed. + */ +static void +metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) +{ + metaslab_t *msp = arg; + ASSERT(msp->ms_cur_ts != NULL); + range_tree_add(msp->ms_cur_ts->ts_tree, offset, size); + if (msp->ms_prev_ts != NULL) { + ASSERT(!range_tree_contains_part(msp->ms_prev_ts->ts_tree, + offset, size)); + } +} + +/* + * Does a metaslab's automatic trim operation processing. This must be + * called from metaslab_sync, with the txg number of the txg. This function + * issues trims in intervals as dictated by the zfs_txgs_per_trim tunable. + * If the previous trimset has not yet finished trimming, this function + * decides what to do based on `preserve_spilled'. If preserve_spilled is + * false, the next trimset which would have been issued is simply dropped to + * limit memory usage. Otherwise it is preserved by adding it to the cur_ts + * trimset. + */ +void +metaslab_auto_trim(metaslab_t *msp, uint64_t txg, boolean_t preserve_spilled) +{ + /* for atomicity */ + uint64_t txgs_per_trim = zfs_txgs_per_trim; + + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&msp->ms_lock); + + /* + * Since we typically have hundreds of metaslabs per vdev, but we only + * trim them once every zfs_txgs_per_trim txgs, it'd be best if we + * could sequence the TRIM commands from all metaslabs so that they + * don't all always pound the device in the same txg. We do so by + * artificially inflating the birth txg of the first trim set by a + * sequence number derived from the metaslab's starting offset + * (modulo zfs_txgs_per_trim). Thus, for the default 200 metaslabs and + * 32 txgs per trim, we'll only be trimming ~6.25 metaslabs per txg. + * + * If we detect that the txg has advanced too far ahead of ts_birth, + * it means our birth txg is out of lockstep. Recompute it by + * rounding down to the nearest zfs_txgs_per_trim multiple and adding + * our metaslab id modulo zfs_txgs_per_trim. + */ + if (txg > msp->ms_cur_ts->ts_birth + txgs_per_trim) { + msp->ms_cur_ts->ts_birth = (txg / txgs_per_trim) * + txgs_per_trim + (msp->ms_id % txgs_per_trim); + } + + /* Time to swap out the current and previous trimsets */ + if (txg == msp->ms_cur_ts->ts_birth + txgs_per_trim) { + if (msp->ms_prev_ts != NULL) { + if (msp->ms_trimming_ts != NULL) { + spa_t *spa = msp->ms_group->mg_class->mc_spa; + /* + * The previous trim run is still ongoing, so + * the device is reacting slowly to our trim + * requests. Drop this trimset, so as not to + * back the device up with trim requests. + */ + if (preserve_spilled) { + DTRACE_PROBE1(preserve__spilled, + metaslab_t *, msp); + range_tree_vacate( + msp->ms_prev_ts->ts_tree, + range_tree_add, + msp->ms_cur_ts->ts_tree); + } else { + DTRACE_PROBE1(drop__spilled, + metaslab_t *, msp); + spa_trimstats_auto_slow_incr(spa); + } + metaslab_free_trimset(msp->ms_prev_ts); + } else if (msp->ms_group->mg_vd->vdev_man_trimming) { + /* + * If a manual trim is ongoing, we want to + * inhibit autotrim temporarily so it doesn't + * slow down the manual trim. + */ + metaslab_free_trimset(msp->ms_prev_ts); + } else { + /* + * Trim out aged extents on the vdevs - these + * are safe to be destroyed now. We'll keep + * the trimset around to deny allocations from + * these regions while the trims are ongoing. + */ + zio_nowait(metaslab_exec_trim(msp, B_TRUE)); + } + } + msp->ms_prev_ts = msp->ms_cur_ts; + msp->ms_cur_ts = metaslab_new_trimset(txg, &msp->ms_lock); + } + mutex_exit(&msp->ms_lock); +} + +/* + * Computes the amount of memory a trimset is expected to use if issued out + * to be trimmed. The calculation isn't 100% accurate, because we don't + * know how the trimset's extents might subdivide into smaller extents + * (dkioc_free_list_ext_t) that actually get passed to the zio, but luckily + * the extent structure is fairly small compared to the size of a zio_t, so + * it's less important that we get that absolutely correct. We just want to + * get it "close enough". + */ +static uint64_t +metaslab_trimset_mem_used(metaslab_trimset_t *ts) +{ + uint64_t result = 0; + + result += avl_numnodes(&ts->ts_tree->rt_root) * (sizeof (range_seg_t) + + sizeof (dkioc_free_list_ext_t)); + result += ((range_tree_space(ts->ts_tree) / zfs_max_bytes_per_trim) + + 1) * sizeof (zio_t); + result += sizeof (range_tree_t) + sizeof (metaslab_trimset_t); + + return (result); +} + +/* + * Computes the amount of memory used by the trimsets and queued trim zios of + * a metaslab. + */ +uint64_t +metaslab_trim_mem_used(metaslab_t *msp) +{ + uint64_t result = 0; + + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&msp->ms_lock); + result += metaslab_trimset_mem_used(msp->ms_cur_ts); + if (msp->ms_prev_ts != NULL) + result += metaslab_trimset_mem_used(msp->ms_prev_ts); + mutex_exit(&msp->ms_lock); + + return (result); +} + +static void +metaslab_trim_done(zio_t *zio) +{ + metaslab_t *msp = zio->io_private; + boolean_t held; + + ASSERT(msp != NULL); + ASSERT(msp->ms_trimming_ts != NULL); + held = MUTEX_HELD(&msp->ms_lock); + if (!held) + mutex_enter(&msp->ms_lock); + metaslab_free_trimset(msp->ms_trimming_ts); + msp->ms_trimming_ts = NULL; + cv_broadcast(&msp->ms_trim_cv); + if (!held) + mutex_exit(&msp->ms_lock); +} + +/* + * Executes a zio_trim on a range tree holding freed extents in the metaslab. + * The set of extents is taken from the metaslab's ms_prev_ts. If there is + * another trim currently executing on that metaslab, this function blocks + * until that trim completes. + * The `auto_trim' argument signals whether the trim is being invoked on + * behalf of auto or manual trim. The differences are: + * 1) For auto trim the trimset is split up into zios of no more than + * zfs_max_bytes_per_trim bytes. Manual trim already does this + * earlier, so the whole trimset is issued in a single zio. + * 2) The zio(s) generated are tagged with either ZIO_PRIORITY_AUTO_TRIM or + * ZIO_PRIORITY_MAN_TRIM to allow differentiating them further down + * the pipeline (see zio_priority_t in sys/zio_priority.h). + * The function always returns a zio that the caller should zio_(no)wait. + */ +static zio_t * +metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_class->mc_spa; + vdev_t *vd = mg->mg_vd; + range_tree_t *trim_tree; + const uint64_t max_bytes = zfs_max_bytes_per_trim; + const enum zio_flag trim_flags = ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CONFIG_WRITER; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* wait for a preceding trim to finish */ + while (msp->ms_trimming_ts != NULL) + cv_wait(&msp->ms_trim_cv, &msp->ms_lock); + msp->ms_trimming_ts = msp->ms_prev_ts; + msp->ms_prev_ts = NULL; + trim_tree = msp->ms_trimming_ts->ts_tree; +#ifdef DEBUG + if (msp->ms_loaded) { + for (range_seg_t *rs = avl_first(&trim_tree->rt_root); + rs != NULL; rs = AVL_NEXT(&trim_tree->rt_root, rs)) { + if (!range_tree_contains_part(msp->ms_tree, + rs->rs_start, rs->rs_end - rs->rs_start)) { + panic("trimming allocated region; rs=%p", + (void*)rs); + } + } + } +#endif + + /* Nothing to trim */ + if (range_tree_space(trim_tree) == 0) { + metaslab_free_trimset(msp->ms_trimming_ts); + msp->ms_trimming_ts = 0; + return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); + } + + if (auto_trim) { + uint64_t start = 0; + range_seg_t *rs; + range_tree_t *sub_trim_tree = range_tree_create(NULL, NULL, + &msp->ms_lock); + zio_t *pio = zio_null(NULL, spa, vd, metaslab_trim_done, msp, + 0); + + rs = avl_first(&trim_tree->rt_root); + if (rs != NULL) + start = rs->rs_start; + while (rs != NULL) { + uint64_t end = MIN(rs->rs_end, start + (max_bytes - + range_tree_space(sub_trim_tree))); + + ASSERT3U(start, <=, end); + if (start == end) { + rs = AVL_NEXT(&trim_tree->rt_root, rs); + if (rs != NULL) + start = rs->rs_start; + continue; + } + range_tree_add(sub_trim_tree, start, end - start); + ASSERT3U(range_tree_space(sub_trim_tree), <=, + max_bytes); + if (range_tree_space(sub_trim_tree) == max_bytes) { + zio_nowait(zio_trim_tree(pio, spa, vd, + sub_trim_tree, auto_trim, NULL, NULL, + trim_flags, msp)); + range_tree_vacate(sub_trim_tree, NULL, NULL); + } + start = end; + } + if (range_tree_space(sub_trim_tree) != 0) { + zio_nowait(zio_trim_tree(pio, spa, vd, sub_trim_tree, + auto_trim, NULL, NULL, trim_flags, msp)); + range_tree_vacate(sub_trim_tree, NULL, NULL); + } + range_tree_destroy(sub_trim_tree); + + return (pio); + } else { + return (zio_trim_tree(NULL, spa, vd, trim_tree, auto_trim, + metaslab_trim_done, msp, trim_flags, msp)); + } +} + +/* + * Allocates and initializes a new trimset structure. The `txg' argument + * indicates when this trimset was born and `lock' indicates the lock to + * link to the range tree. + */ +static metaslab_trimset_t * +metaslab_new_trimset(uint64_t txg, kmutex_t *lock) +{ + metaslab_trimset_t *ts; + + ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); + ts->ts_birth = txg; + ts->ts_tree = range_tree_create(NULL, NULL, lock); + + return (ts); +} + +/* + * Destroys and frees a trim set previously allocated by metaslab_new_trimset. + */ +static void +metaslab_free_trimset(metaslab_trimset_t *ts) +{ + range_tree_vacate(ts->ts_tree, NULL, NULL); + range_tree_destroy(ts->ts_tree); + kmem_free(ts, sizeof (*ts)); +} + +/* + * Checks whether an allocation conflicts with an ongoing trim operation in + * the given metaslab. This function takes a segment starting at `*offset' + * of `size' and checks whether it hits any region in the metaslab currently + * being trimmed. If yes, it tries to adjust the allocation to the end of + * the region being trimmed (P2ROUNDUP aligned by `align'), but only up to + * `limit' (no part of the allocation is allowed to go past this point). + * + * Returns B_FALSE if either the original allocation wasn't in conflict, or + * the conflict could be resolved by adjusting the value stored in `offset' + * such that the whole allocation still fits below `limit'. Returns B_TRUE + * if the allocation conflict couldn't be resolved. + */ +static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, + uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit) +{ + uint64_t new_offset; + + ASSERT3U(*offset + size, <=, limit); + + if (msp->ms_trimming_ts == NULL) + /* no trim conflict, original offset is OK */ + return (B_FALSE); + + new_offset = P2ROUNDUP(range_tree_find_gap(msp->ms_trimming_ts->ts_tree, + *offset, size), align); + if (new_offset + size > limit) + /* trim conflict and adjustment not possible */ + return (B_TRUE); + + /* trim conflict, but adjusted offset still within limit */ + *offset = new_offset; + return (B_FALSE); +} + #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(metaslab_aliquot, ulong, 0644); @@ -4340,6 +4878,10 @@ module_param(zfs_metaslab_switch_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_switch_threshold, "segment-based metaslab selection maximum buckets before switching"); +module_param(zfs_txgs_per_trim, int, 0644); +MODULE_PARM_DESC(zfs_txgs_per_trim, + "txgs per trim"); + module_param(metaslab_force_ganging, ulong, 0644); MODULE_PARM_DESC(metaslab_force_ganging, "blocks larger than this size are forced to be gang blocks"); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 2181a92df5e3..2e11e481249e 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -24,6 +24,7 @@ */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -510,6 +511,21 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) return (NULL); } +/* + * Given an extent start offset and size, will look through the provided + * range tree and find a suitable start offset (starting at `start') such + * that the requested extent _doesn't_ overlap with any range segment in + * the range tree. + */ +uint64_t +range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs; + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) + start = rs->rs_end; + return (start); +} + void range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { @@ -526,6 +542,15 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) return (range_tree_find(rt, start, size) != NULL); } +/* + * Same as range_tree_contains, but locates even just a partial overlap. + */ +boolean_t +range_tree_contains_part(range_tree_t *rt, uint64_t start, uint64_t size) +{ + return (range_tree_find_impl(rt, start, size) != NULL); +} + /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bbe2f89629a5..12042ccd2d80 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -157,6 +157,10 @@ static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); +static void spa_auto_trim(spa_t *spa, uint64_t txg); +static void spa_vdev_man_trim_done(spa_t *spa); +static void spa_vdev_auto_trim_done(spa_t *spa); +static uint64_t spa_min_trim_rate(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ @@ -554,6 +558,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: + case ZPOOL_PROP_FORCETRIM: + case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); @@ -1436,6 +1442,16 @@ spa_unload(spa_t *spa) spa_load_note(spa, "UNLOADING"); + /* + * Stop manual trim before stopping spa sync, because manual trim + * needs to execute a synctask (trim timestamp sync) at the end. + */ + mutex_enter(&spa->spa_auto_trim_lock); + mutex_enter(&spa->spa_man_trim_lock); + spa_trim_stop_wait(spa); + mutex_exit(&spa->spa_man_trim_lock); + mutex_exit(&spa->spa_auto_trim_lock); + /* * Stop async tasks. */ @@ -1454,6 +1470,14 @@ spa_unload(spa_t *spa) spa->spa_sync_on = B_FALSE; } + /* + * Stop autotrim tasks. + */ + mutex_enter(&spa->spa_auto_trim_lock); + if (spa->spa_auto_trim_taskq) + spa_auto_trim_taskq_destroy(spa); + mutex_exit(&spa->spa_auto_trim_lock); + /* * Even though vdev_free() also calls vdev_metaslab_fini, we need * to call it earlier, before we wait for async i/o to complete. @@ -3504,10 +3528,22 @@ spa_ld_get_props(spa_t *spa) spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); + spa_prop_find(spa, ZPOOL_PROP_FORCETRIM, &spa->spa_force_trim); + + mutex_enter(&spa->spa_auto_trim_lock); + spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_auto_trim); + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) + spa_auto_trim_taskq_create(spa); + mutex_exit(&spa->spa_auto_trim_lock); spa->spa_autoreplace = (autoreplace != 0); } + (void) spa_dir_prop(spa, DMU_POOL_TRIM_START_TIME, + &spa->spa_man_trim_start_time); + (void) spa_dir_prop(spa, DMU_POOL_TRIM_STOP_TIME, + &spa->spa_man_trim_stop_time); + /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on @@ -5257,6 +5293,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); + spa->spa_force_trim = zpool_prop_default_numeric(ZPOOL_PROP_FORCETRIM); + + mutex_enter(&spa->spa_auto_trim_lock); + spa->spa_auto_trim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) + spa_auto_trim_taskq_create(spa); + mutex_exit(&spa->spa_auto_trim_lock); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); @@ -6011,6 +6054,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + vdev_trim_stop_wait(oldvd->vdev_top); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. @@ -6211,6 +6256,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + vdev_trim_stop_wait(vd->vdev_top); + ASSERT(pvd->vdev_children >= 2); /* @@ -6565,6 +6612,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + vdev_trim_stop_wait(rvd); + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); @@ -7051,6 +7100,8 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + vdev_trim_stop_wait(vd); + /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away @@ -7190,6 +7241,12 @@ spa_async_thread(void *arg) mutex_exit(&spa_namespace_lock); } + if (tasks & SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY) { + mutex_enter(&spa->spa_man_trim_lock); + spa_man_trim_taskq_destroy(spa); + mutex_exit(&spa->spa_man_trim_lock); + } + /* * Let the world know that we're done. */ @@ -7280,6 +7337,15 @@ spa_async_request(spa_t *spa, int task) mutex_exit(&spa->spa_async_lock); } +void +spa_async_unrequest(spa_t *spa, int task) +{ + zfs_dbgmsg("spa=%s async unrequest task=%u", spa->spa_name, task); + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~task; + mutex_exit(&spa->spa_async_lock); +} + /* * ========================================================================== * SPA syncing routines @@ -7689,6 +7755,21 @@ spa_sync_props(void *arg, dmu_tx_t *tx) case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; + case ZPOOL_PROP_FORCETRIM: + spa->spa_force_trim = intval; + break; + case ZPOOL_PROP_AUTOTRIM: + mutex_enter(&spa->spa_auto_trim_lock); + if (intval != spa->spa_auto_trim) { + spa->spa_auto_trim = intval; + if (intval != 0) + spa_auto_trim_taskq_create(spa); + else + spa_auto_trim_taskq_destroy( + spa); + } + mutex_exit(&spa->spa_auto_trim_lock); + break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) @@ -8062,6 +8143,10 @@ spa_sync(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_alloc_locks[i]); } + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) + spa_auto_trim(spa, txg); + + /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. @@ -8406,6 +8491,275 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } +/* + * Dispatches all auto-trim processing to all top-level vdevs. This is + * called from spa_sync once every txg. + */ +static void +spa_auto_trim(spa_t *spa, uint64_t txg) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER) == SCL_CONFIG); + ASSERT(!MUTEX_HELD(&spa->spa_auto_trim_lock)); + ASSERT(spa->spa_auto_trim_taskq != NULL); + + /* + * Another pool management task might be currently prevented from + * starting and the current txg sync was invoked on its behalf, + * so be prepared to postpone autotrim processing. + */ + if (!mutex_tryenter(&spa->spa_auto_trim_lock)) + return; + spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children; + mutex_exit(&spa->spa_auto_trim_lock); + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP); + vti->vti_vdev = spa->spa_root_vdev->vdev_child[i]; + vti->vti_txg = txg; + vti->vti_done_cb = (void (*)(void *))spa_vdev_auto_trim_done; + vti->vti_done_arg = spa; + (void) taskq_dispatch(spa->spa_auto_trim_taskq, + (void (*)(void *))vdev_auto_trim, vti, TQ_SLEEP); + } +} + +/* + * Performs the sync update of the MOS pool directory's trim start/stop values. + */ +static void +spa_trim_update_time_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TRIM_START_TIME, sizeof (uint64_t), 1, + &spa->spa_man_trim_start_time, tx)); + VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TRIM_STOP_TIME, sizeof (uint64_t), 1, + &spa->spa_man_trim_stop_time, tx)); +} + +/* + * Updates the in-core and on-disk manual TRIM operation start/stop time. + * Passing UINT64_MAX for either start_time or stop_time means that no + * update to that value should be recorded. + */ +static dmu_tx_t * +spa_trim_update_time(spa_t *spa, uint64_t start_time, uint64_t stop_time) +{ + int err; + dmu_tx_t *tx; + + ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock)); + if (start_time != UINT64_MAX) + spa->spa_man_trim_start_time = start_time; + if (stop_time != UINT64_MAX) + spa->spa_man_trim_stop_time = stop_time; + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (NULL); + } + dsl_sync_task_nowait(spa_get_dsl(spa), spa_trim_update_time_sync, + spa, 1, ZFS_SPACE_CHECK_RESERVED, tx); + + return (tx); +} + +/* + * Initiates an manual TRIM of the whole pool. This kicks off individual + * TRIM tasks for each top-level vdev, which then pass over all of the free + * space in all of the vdev's metaslabs and issues TRIM commands for that + * space to the underlying vdevs. + */ +extern void +spa_man_trim(spa_t *spa, uint64_t rate) +{ + dmu_tx_t *time_update_tx; + + mutex_enter(&spa->spa_man_trim_lock); + + if (rate != 0) + spa->spa_man_trim_rate = MAX(rate, spa_min_trim_rate(spa)); + else + spa->spa_man_trim_rate = 0; + + if (spa->spa_num_man_trimming) { + /* + * TRIM is already ongoing. Wake up all sleeping vdev trim + * threads because the trim rate might have changed above. + */ + cv_broadcast(&spa->spa_man_trim_update_cv); + mutex_exit(&spa->spa_man_trim_lock); + return; + } + spa_man_trim_taskq_create(spa); + spa->spa_man_trim_stop = B_FALSE; + + spa_event_notify(spa, NULL, NULL, ESC_ZFS_TRIM_START); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP); + vti->vti_vdev = vd; + vti->vti_done_cb = (void (*)(void *))spa_vdev_man_trim_done; + vti->vti_done_arg = spa; + spa->spa_num_man_trimming++; + + vd->vdev_trim_prog = 0; + (void) taskq_dispatch(spa->spa_man_trim_taskq, + (void (*)(void *))vdev_man_trim, vti, TQ_SLEEP); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); + time_update_tx = spa_trim_update_time(spa, gethrestime_sec(), 0); + mutex_exit(&spa->spa_man_trim_lock); + /* mustn't hold spa_man_trim_lock to prevent deadlock /w syncing ctx */ + if (time_update_tx != NULL) + dmu_tx_commit(time_update_tx); +} + +/* + * Orders a manual TRIM operation to stop and returns immediately. + */ +extern void +spa_man_trim_stop(spa_t *spa) +{ + boolean_t held = MUTEX_HELD(&spa->spa_man_trim_lock); + if (!held) + mutex_enter(&spa->spa_man_trim_lock); + spa->spa_man_trim_stop = B_TRUE; + cv_broadcast(&spa->spa_man_trim_update_cv); + if (!held) + mutex_exit(&spa->spa_man_trim_lock); +} + +/* + * Orders a manual TRIM operation to stop and waits for both manual and + * automatic TRIM to complete. By holding both the spa_man_trim_lock and + * the spa_auto_trim_lock, the caller can guarantee that after this + * function returns, no new TRIM operations can be initiated in parallel. + */ +void +spa_trim_stop_wait(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock)); + ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock)); + spa->spa_man_trim_stop = B_TRUE; + cv_broadcast(&spa->spa_man_trim_update_cv); + while (spa->spa_num_man_trimming > 0) + cv_wait(&spa->spa_man_trim_done_cv, &spa->spa_man_trim_lock); + while (spa->spa_num_auto_trimming > 0) + cv_wait(&spa->spa_auto_trim_done_cv, &spa->spa_auto_trim_lock); +} + +/* + * Returns manual TRIM progress. Progress is indicated by four return values: + * 1) prog: the number of bytes of space on the pool in total that manual + * TRIM has already passed (regardless if the space is allocated or not). + * Completion of the operation is indicated when either the returned value + * is zero, or when the returned value is equal to the sum of the sizes of + * all top-level vdevs. + * 2) rate: the trim rate in bytes per second. A value of zero indicates that + * trim progresses as fast as possible. + * 3) start_time: the UNIXTIME of when the last manual TRIM operation was + * started. If no manual trim was ever initiated on the pool, this is + * zero. + * 4) stop_time: the UNIXTIME of when the last manual TRIM operation has + * stopped on the pool. If a trim was started (start_time != 0), but has + * not yet completed, stop_time will be zero. If a trim is NOT currently + * ongoing and start_time is non-zero, this indicates that the previously + * initiated TRIM operation was interrupted. + */ +extern void +spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate, + uint64_t *start_time, uint64_t *stop_time) +{ + uint64_t total = 0; + vdev_t *root_vd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + mutex_enter(&spa->spa_man_trim_lock); + if (spa->spa_num_man_trimming > 0) { + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + total += root_vd->vdev_child[i]->vdev_trim_prog; + } + } + *prog = total; + *rate = spa->spa_man_trim_rate; + *start_time = spa->spa_man_trim_start_time; + *stop_time = spa->spa_man_trim_stop_time; + mutex_exit(&spa->spa_man_trim_lock); +} + +/* + * Callback when a vdev_man_trim has finished on a single top-level vdev. + */ +static void +spa_vdev_man_trim_done(spa_t *spa) +{ + dmu_tx_t *time_update_tx = NULL; + + mutex_enter(&spa->spa_man_trim_lock); + ASSERT(spa->spa_num_man_trimming > 0); + spa->spa_num_man_trimming--; + if (spa->spa_num_man_trimming == 0) { + /* if we were interrupted, leave stop_time at zero */ + if (!spa->spa_man_trim_stop) + time_update_tx = spa_trim_update_time(spa, UINT64_MAX, + gethrestime_sec()); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_TRIM_FINISH); + spa_async_request(spa, SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY); + cv_broadcast(&spa->spa_man_trim_done_cv); + } + mutex_exit(&spa->spa_man_trim_lock); + + if (time_update_tx != NULL) + dmu_tx_commit(time_update_tx); +} + +/* + * Called from vdev_auto_trim when a vdev has completed its auto-trim + * processing. + */ +static void +spa_vdev_auto_trim_done(spa_t *spa) +{ + mutex_enter(&spa->spa_auto_trim_lock); + ASSERT(spa->spa_num_auto_trimming > 0); + spa->spa_num_auto_trimming--; + if (spa->spa_num_auto_trimming == 0) + cv_broadcast(&spa->spa_auto_trim_done_cv); + mutex_exit(&spa->spa_auto_trim_lock); +} + +/* + * Determines the minimum sensible rate at which a manual TRIM can be + * performed on a given spa and returns it. Since we perform TRIM in + * metaslab-sized increments, we'll just let the longest step between + * metaslab TRIMs be 100s (random number, really). Thus, on a typical + * 200-metaslab vdev, the longest TRIM should take is about 5.5 hours. + * It *can* take longer if the device is really slow respond to + * zio_trim() commands or it contains more than 200 metaslabs, or + * metaslab sizes vary widely between top-level vdevs. + */ +static uint64_t +spa_min_trim_rate(spa_t *spa) +{ + uint64_t i, smallest_ms_sz = UINT64_MAX; + + /* find the smallest metaslab */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + smallest_ms_sz = MIN(smallest_ms_sz, + spa->spa_root_vdev->vdev_child[i]->vdev_ms[0]->ms_size); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); + VERIFY(smallest_ms_sz != 0); + + /* minimum TRIM rate is 1/100th of the smallest metaslab size */ + return (smallest_ms_sz / 100); +} + #if defined(_KERNEL) /* state manipulation functions */ EXPORT_SYMBOL(spa_open); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 8616abda37bd..d098008ad99e 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -21,9 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include @@ -509,6 +509,19 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); + /* If we're getting stats, calculate trim progress from leaf vdevs. */ + if (getstats) { + uint64_t prog, rate, start_time, stop_time; + + spa_get_trim_prog(spa, &prog, &rate, &start_time, &stop_time); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TRIM_PROG, prog); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TRIM_RATE, rate); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TRIM_START_TIME, + start_time); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TRIM_STOP_TIME, + stop_time); + } + /* * Store what's necessary for reading the MOS in the label. */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 877f312b1b3e..aa6192dd54fa 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. @@ -228,6 +228,22 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ +struct spa_trimstats { + kstat_named_t st_extents; /* # of extents issued to zio */ + kstat_named_t st_bytes; /* # of bytes issued to zio */ + kstat_named_t st_extents_skipped; /* # of extents too small */ + kstat_named_t st_bytes_skipped; /* bytes in extents_skipped */ + kstat_named_t st_auto_slow; /* trim slow, exts dropped */ +}; + +static spa_trimstats_t spa_trimstats_template = { + { "extents", KSTAT_DATA_UINT64 }, + { "bytes", KSTAT_DATA_UINT64 }, + { "extents_skipped", KSTAT_DATA_UINT64 }, + { "bytes_skipped", KSTAT_DATA_UINT64 }, + { "auto_slow", KSTAT_DATA_UINT64 }, +}; + static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; @@ -420,6 +436,14 @@ int zfs_user_indirect_is_special = B_TRUE; */ int zfs_special_class_metadata_reserve_pct = 25; +/* + * Percentage of the number of CPUs to use as the autotrim taskq thread count. + */ +int zfs_auto_trim_taskq_batch_pct = 75; + +static void spa_trimstats_create(spa_t *spa); +static void spa_trimstats_destroy(spa_t *spa); + /* * ========================================================================== * SPA config locking @@ -639,12 +663,17 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_auto_trim_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_man_trim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_auto_trim_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_man_trim_update_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_man_trim_done_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); @@ -715,6 +744,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } + spa_trimstats_create(spa); + spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; @@ -783,6 +814,8 @@ spa_remove(spa_t *spa) spa_stats_destroy(spa); spa_config_lock_destroy(spa); + spa_trimstats_destroy(spa); + for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); @@ -793,6 +826,9 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + cv_destroy(&spa->spa_auto_trim_done_cv); + cv_destroy(&spa->spa_man_trim_update_cv); + cv_destroy(&spa->spa_man_trim_done_cv); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); @@ -806,6 +842,8 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); + mutex_destroy(&spa->spa_auto_trim_lock); + mutex_destroy(&spa->spa_man_trim_lock); kmem_free(spa, sizeof (spa_t)); } @@ -1124,6 +1162,9 @@ spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + mutex_enter(&spa->spa_auto_trim_lock); + mutex_enter(&spa->spa_man_trim_lock); + spa_trim_stop_wait(spa); return (spa_vdev_config_enter(spa)); } @@ -1224,6 +1265,8 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { spa_vdev_config_exit(spa, vd, txg, error, FTAG); + mutex_exit(&spa->spa_man_trim_lock); + mutex_exit(&spa->spa_auto_trim_lock); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); @@ -1919,6 +1962,18 @@ spa_deadman_synctime(spa_t *spa) return (spa->spa_deadman_synctime); } +spa_force_trim_t +spa_get_force_trim(spa_t *spa) +{ + return (spa->spa_force_trim); +} + +spa_auto_trim_t +spa_get_auto_trim(spa_t *spa) +{ + return (spa->spa_auto_trim); +} + uint64_t spa_deadman_ziotime(spa_t *spa) { @@ -2437,6 +2492,185 @@ spa_suspend_async_destroy(spa_t *spa) return (B_FALSE); } +int +spa_trimstats_kstat_update(kstat_t *ksp, int rw) +{ + spa_t *spa; + spa_trimstats_t *trimstats; + int i; + + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) { + spa = ksp->ks_private; + trimstats = spa->spa_trimstats; + for (i = 0; i < sizeof (spa_trimstats_t) / + sizeof (kstat_named_t); ++i) + ((kstat_named_t *)trimstats)[i].value.ui64 = 0; + } + return (0); +} + +/* + * Creates the trim kstats structure for a spa. + */ +static void +spa_trimstats_create(spa_t *spa) +{ + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + if (spa->spa_name[0] == '$') + return; + + ASSERT3P(spa->spa_trimstats, ==, NULL); + ASSERT3P(spa->spa_trimstats_ks, ==, NULL); + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + ksp = kstat_create(name, 0, "trimstats", "misc", + KSTAT_TYPE_NAMED, sizeof (spa_trimstats_template) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (ksp != NULL) { + ksp->ks_private = spa; + ksp->ks_update = spa_trimstats_kstat_update; + spa->spa_trimstats_ks = ksp; + spa->spa_trimstats = + kmem_alloc(sizeof (spa_trimstats_t), KM_SLEEP); + *spa->spa_trimstats = spa_trimstats_template; + spa->spa_trimstats_ks->ks_data = spa->spa_trimstats; + kstat_install(spa->spa_trimstats_ks); + } else { + cmn_err(CE_NOTE, "!Cannot create trim kstats for pool %s", + spa->spa_name); + } +} + +/* + * Destroys the trim kstats for a spa. + */ +static void +spa_trimstats_destroy(spa_t *spa) +{ + if (spa->spa_trimstats_ks) { + kstat_delete(spa->spa_trimstats_ks); + kmem_free(spa->spa_trimstats, sizeof (spa_trimstats_t)); + spa->spa_trimstats_ks = NULL; + } +} + +/* + * Updates the numerical trim kstats for a spa. + */ +void +spa_trimstats_update(spa_t *spa, uint64_t extents, uint64_t bytes, + uint64_t extents_skipped, uint64_t bytes_skipped) +{ + spa_trimstats_t *st = spa->spa_trimstats; + if (st) { + atomic_add_64(&st->st_extents.value.ui64, extents); + atomic_add_64(&st->st_bytes.value.ui64, bytes); + atomic_add_64(&st->st_extents_skipped.value.ui64, + extents_skipped); + atomic_add_64(&st->st_bytes_skipped.value.ui64, + bytes_skipped); + } +} + +/* + * Increments the slow-trim kstat for a spa. + */ +void +spa_trimstats_auto_slow_incr(spa_t *spa) +{ + spa_trimstats_t *st = spa->spa_trimstats; + if (st) + atomic_inc_64(&st->st_auto_slow.value.ui64); +} + +/* + * Creates the taskq used for dispatching auto-trim. This is called only when + * the property is set to `on' or when the pool is loaded (and the autotrim + * property is `on'). + */ +void +spa_auto_trim_taskq_create(spa_t *spa) +{ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock)); + ASSERT(spa->spa_auto_trim_taskq == NULL); + (void) snprintf(name, MAXPATHLEN, "%s_auto_trim", spa->spa_name); + spa->spa_auto_trim_taskq = taskq_create(name, + zfs_auto_trim_taskq_batch_pct, minclsyspri, 1, INT_MAX, + TASKQ_THREADS_CPU_PCT); + VERIFY(spa->spa_auto_trim_taskq != NULL); + kmem_free(name, MAXPATHLEN); +} + +/* + * Creates the taskq for dispatching manual trim. This taskq is recreated + * each time `zpool trim ' is issued and destroyed after the run + * completes in an async spa request. + */ +void +spa_man_trim_taskq_create(spa_t *spa) +{ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock)); + spa_async_unrequest(spa, SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY); + if (spa->spa_man_trim_taskq != NULL) { + /* + * The async taskq destroy has been pre-empted, so just + * return, the taskq is still good to use. + */ + return; + } + (void) snprintf(name, MAXPATHLEN, "%s_man_trim", spa->spa_name); + spa->spa_man_trim_taskq = taskq_create(name, + spa->spa_root_vdev->vdev_children, minclsyspri, + spa->spa_root_vdev->vdev_children, + spa->spa_root_vdev->vdev_children, TASKQ_PREPOPULATE); + VERIFY(spa->spa_man_trim_taskq != NULL); + kmem_free(name, MAXPATHLEN); +} + +/* + * Destroys the taskq created in spa_auto_trim_taskq_create. The taskq + * is only destroyed when the autotrim property is set to `off'. + */ +void +spa_auto_trim_taskq_destroy(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock)); + ASSERT(spa->spa_auto_trim_taskq != NULL); + while (spa->spa_num_auto_trimming != 0) + cv_wait(&spa->spa_auto_trim_done_cv, &spa->spa_auto_trim_lock); + taskq_destroy(spa->spa_auto_trim_taskq); + spa->spa_auto_trim_taskq = NULL; +} + +/* + * Destroys the taskq created in spa_man_trim_taskq_create. The taskq is + * destroyed after a manual trim run completes from an async spa request. + * There is a bit of lag between an async request being issued at the + * completion of a trim run and it finally being acted on, hence why this + * function checks if new manual trimming threads haven't been re-spawned. + * If they have, we assume the async spa request been preempted by another + * manual trim request and we back off. + */ +void +spa_man_trim_taskq_destroy(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock)); + ASSERT(spa->spa_man_trim_taskq != NULL); + if (spa->spa_num_man_trimming != 0) + /* another trim got started before we got here, back off */ + return; + taskq_destroy(spa->spa_man_trim_taskq); + spa->spa_man_trim_taskq = NULL; +} + #if defined(_KERNEL) #include @@ -2664,5 +2898,11 @@ MODULE_PARM_DESC(zfs_ddt_data_is_special, module_param(zfs_user_indirect_is_special, int, 0644); MODULE_PARM_DESC(zfs_user_indirect_is_special, "Place user data indirect blocks into the special class"); + +module_param(zfs_auto_trim_taskq_batch_pct, int, 0644); +MODULE_PARM_DESC(zfs_auto_trim_taskq_batch_pct, + "Percentage of the number of CPUs to use as the autotrim taskq" + " thread count"); + /* END CSTYLED */ #endif diff --git a/module/zfs/trace.c b/module/zfs/trace.c index eb6efe841cbd..7b5d4f82e1da 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 64fc6fadd427..58e60ef003e3 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -27,6 +27,7 @@ * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -53,6 +54,7 @@ #include #include #include +#include /* target number of metaslabs per top-level vdev */ int vdev_max_ms_count = 200; @@ -198,6 +200,15 @@ static vdev_ops_t *vdev_ops_table[] = { NULL }; +/* + * If we accumulate a lot of trim extents due to trim running slow, this + * is the memory pressure valve. We limit the amount of memory consumed + * by the extents in memory to physmem/zfs_trim_mem_lim_fact (by default + * 2%). If we exceed this limit, we start throwing out new extents + * without queueing them. + */ +int zfs_trim_mem_lim_fact = 50; + /* * Given a vdev type, return the appropriate ops vector. */ @@ -551,6 +562,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vdev_queue_init(vd); vdev_cache_init(vd); + mutex_init(&vd->vdev_trim_zios_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_trim_zios_cv, NULL, CV_DEFAULT, NULL); + return (vd); } @@ -975,6 +989,9 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); + ASSERT0(vd->vdev_trim_zios); + mutex_destroy(&vd->vdev_trim_zios_lock); + cv_destroy(&vd->vdev_trim_zios_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -2352,6 +2369,23 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +boolean_t +vdev_is_dirty(vdev_t *vd, int flags, void *arg) +{ + ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); + ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); + ASSERT3U(flags, ==, VDD_METASLAB); + + for (uint64_t txg = 0; txg < TXG_SIZE; txg++) { + if (txg_list_member(&vd->vdev_ms_list, arg, txg)) + return (B_TRUE); + } + + return (B_FALSE); +} + void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { @@ -4667,6 +4701,182 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) spa->spa_resilver_deferred = B_TRUE; } +/* + * Implements the per-vdev portion of manual TRIM. The function passes over + * all metaslabs on this vdev and performs a metaslab_trim_all on them. It's + * also responsible for rate-control if spa_man_trim_rate is non-zero. + */ +void +vdev_man_trim(vdev_trim_info_t *vti) +{ + clock_t t = ddi_get_lbolt(); + spa_t *spa = vti->vti_vdev->vdev_spa; + vdev_t *vd = vti->vti_vdev; + uint64_t i, cursor; + boolean_t was_loaded = B_FALSE; + + vd->vdev_man_trimming = B_TRUE; + vd->vdev_trim_prog = 0; + + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + ASSERT(vd->vdev_ms[0] != NULL); + cursor = vd->vdev_ms[0]->ms_start; + i = 0; + while (i < vti->vti_vdev->vdev_ms_count && !spa->spa_man_trim_stop) { + uint64_t delta; + metaslab_t *msp = vd->vdev_ms[i]; + zio_t *trim_io; + + trim_io = metaslab_trim_all(msp, &cursor, &delta, &was_loaded); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + + if (trim_io != NULL) { + ASSERT3U(cursor, >=, vd->vdev_ms[0]->ms_start); + vd->vdev_trim_prog = cursor - vd->vdev_ms[0]->ms_start; + (void) zio_wait(trim_io); + } else { + /* + * If there was nothing more left to trim, that means + * this metaslab is either done trimming, or we + * couldn't load it, move to the next one. + */ + i++; + if (i < vti->vti_vdev->vdev_ms_count) + ASSERT3U(vd->vdev_ms[i]->ms_start, ==, cursor); + } + + /* delay loop to handle fixed-rate trimming */ + for (;;) { + uint64_t rate = spa->spa_man_trim_rate; + uint64_t sleep_delay; + + if (rate == 0) { + /* No delay, just update 't' and move on. */ + t = ddi_get_lbolt(); + break; + } + + sleep_delay = (delta * hz) / rate; + mutex_enter(&spa->spa_man_trim_lock); + (void) cv_timedwait(&spa->spa_man_trim_update_cv, + &spa->spa_man_trim_lock, t); + mutex_exit(&spa->spa_man_trim_lock); + + /* If interrupted, don't try to relock, get out */ + if (spa->spa_man_trim_stop) + goto out; + + /* Timeout passed, move on to the next metaslab. */ + if (ddi_get_lbolt() >= t + sleep_delay) { + t += sleep_delay; + break; + } + } + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + } + spa_config_exit(spa, SCL_STATE_ALL, FTAG); +out: + /* + * Ensure we're marked as "completed" even if we've had to stop + * before processing all metaslabs. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_trim_prog = vd->vdev_stat.vs_space; + mutex_exit(&vd->vdev_stat_lock); + vd->vdev_man_trimming = B_FALSE; + + ASSERT(vti->vti_done_cb != NULL); + vti->vti_done_cb(vti->vti_done_arg); + + kmem_free(vti, sizeof (*vti)); +} + +/* + * Runs through all metaslabs on the vdev and does their autotrim processing. + */ +void +vdev_auto_trim(vdev_trim_info_t *vti) +{ + vdev_t *vd = vti->vti_vdev; + spa_t *spa = vd->vdev_spa; + uint64_t txg = vti->vti_txg; + uint64_t mlim = 0, mused = 0; + boolean_t limited; + + ASSERT3P(vd->vdev_top, ==, vd); + + if (vd->vdev_man_trimming) + goto out; + + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) + mused += metaslab_trim_mem_used(vd->vdev_ms[i]); + mlim = (physmem * PAGESIZE) / (zfs_trim_mem_lim_fact * + spa->spa_root_vdev->vdev_children); + limited = mused > mlim; + DTRACE_PROBE3(autotrim__mem__lim, vdev_t *, vd, uint64_t, mused, + uint64_t, mlim); + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) + metaslab_auto_trim(vd->vdev_ms[i], txg, !limited); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + +out: + ASSERT(vti->vti_done_cb != NULL); + vti->vti_done_cb(vti->vti_done_arg); + + kmem_free(vti, sizeof (*vti)); +} + +static void +trim_stop_set(vdev_t *vd, boolean_t flag) +{ + mutex_enter(&vd->vdev_trim_zios_lock); + vd->vdev_trim_zios_stop = flag; + mutex_exit(&vd->vdev_trim_zios_lock); + + for (uint64_t i = 0; i < vd->vdev_children; i++) + trim_stop_set(vd->vdev_child[i], flag); +} + +static void +trim_stop_wait(vdev_t *vd) +{ + mutex_enter(&vd->vdev_trim_zios_lock); + while (vd->vdev_trim_zios) + cv_wait(&vd->vdev_trim_zios_cv, &vd->vdev_trim_zios_lock); + mutex_exit(&vd->vdev_trim_zios_lock); + + for (uint64_t i = 0; i < vd->vdev_children; i++) + trim_stop_wait(vd->vdev_child[i]); +} + +/* + * This function stops all asynchronous trim I/O going to a vdev and all + * its children. Because trim zios occur outside of the normal transactional + * machinery, we can't rely on the DMU hooks to stop I/O to devices being + * removed or reconfigured. Therefore, all pool management tasks which + * change the vdev configuration need to stop trim I/Os explicitly. + * After this function returns, it is guaranteed that no trim zios will be + * executing on the vdev or any of its children until either of the + * trim locks is released. + */ +void +vdev_trim_stop_wait(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_spa->spa_man_trim_lock)); + ASSERT(MUTEX_HELD(&vd->vdev_spa->spa_auto_trim_lock)); + /* + * First we mark all devices as requesting a trim stop. This starts + * the vdev queue drain (via zio_trim_should_bypass) quickly, then + * we actually wait for all trim zios to get destroyed and then we + * unmark the stop condition so trim zios can configure once the + * pool management operation is done. + */ + trim_stop_set(vd, B_TRUE); + trim_stop_wait(vd); + trim_stop_set(vd, B_FALSE); +} + #if defined(_KERNEL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); @@ -4705,5 +4915,9 @@ MODULE_PARM_DESC(vdev_validate_skip, module_param(zfs_nocacheflush, int, 0644); MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); + +module_param(zfs_trim_mem_lim_fact, int, 0644); +MODULE_PARM_DESC(metaslabs_per_vdev, "Maximum percentage of physical memory " + "to be used for storing trim extents"); /* END CSTYLED */ #endif diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index d13f365dd055..3cbb5b956c68 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include @@ -35,6 +36,7 @@ #include #include #include +#include char *zfs_vdev_scheduler = VDEV_SCHEDULER; static void *zfs_vdev_holder = VDEV_HOLDER; @@ -322,6 +324,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, } else { vd->vd_bdev = bdev; v->vdev_tsd = vd; + /* Reset TRIM flag, as underlying device support may have changed */ + v->vdev_notrim = B_FALSE; rw_exit(&vd->vd_lock); } @@ -751,6 +755,55 @@ vdev_disk_io_start(zio_t *zio) break; + case DKIOCFREE: + { + dkioc_free_list_t *dfl; + + if (!zfs_trim) + break; + + /* + * We perform device support checks here instead of + * in zio_trim_*(), as zio_trim_*() might be invoked + * on a top-level vdev, whereas vdev_disk_io_start + * is guaranteed to be operating a leaf disk vdev. + */ + if (v->vdev_notrim && + spa_get_force_trim(v->vdev_spa) != + SPA_FORCE_TRIM_ON) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + /* + * zio->io_dfl contains a dkioc_free_list_t + * specifying which offsets are to be freed + */ + dfl = zio->io_dfl; + ASSERT(dfl != NULL); + + for (int i = 0; i < dfl->dfl_num_exts; i++) { + int error; + + if (dfl->dfl_exts[i].dfle_length == 0) + continue; + + error = -blkdev_issue_discard(vd->vd_bdev, + (dfl->dfl_exts[i].dfle_start + + dfl->dfl_offset) >> 9, + dfl->dfl_exts[i].dfle_length >> 9, + GFP_NOFS, 0); + + if (error != 0) { + if (error == EOPNOTSUPP || + error == ENXIO) + v->vdev_notrim = B_TRUE; + zio->io_error = SET_ERROR(error); + break; + } + } + break; + } default: zio->io_error = SET_ERROR(ENOTSUP); } @@ -880,19 +933,16 @@ param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) } vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 3551898e0781..8486ce2b259a 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -32,6 +33,9 @@ #include #include #include +#include +#include +#include /* * Virtual device vector for files. @@ -223,6 +227,37 @@ vdev_file_io_start(zio_t *zio) zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL); break; + + case DKIOCFREE: + { + const dkioc_free_list_t *dfl = zio->io_dfl; + + ASSERT(dfl != NULL); + if (!zfs_trim) + break; + for (int i = 0; i < dfl->dfl_num_exts; i++) { + struct flock flck; + int error; + + if (dfl->dfl_exts[i].dfle_length == 0) + continue; + + bzero(&flck, sizeof (flck)); + flck.l_type = F_FREESP; + flck.l_start = dfl->dfl_exts[i].dfle_start + + dfl->dfl_offset; + flck.l_len = dfl->dfl_exts[i].dfle_length; + flck.l_whence = 0; + + error = VOP_SPACE(vf->vf_vnode, + F_FREESP, &flck, 0, 0, kcred, NULL); + if (error != 0) { + zio->io_error = SET_ERROR(error); + break; + } + } + break; + } default: zio->io_error = SET_ERROR(ENOTSUP); } @@ -244,19 +279,19 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_trim = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; void @@ -280,19 +315,19 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_trim = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 65b847d66470..201b6ca01dd4 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* @@ -682,6 +682,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } + + /* grab per-leaf-vdev trim stats */ + if (getstats) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TRIM_PROG, + vd->vdev_trim_prog); + } } return (nv); diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index a92d956cdaa2..179dd8f23ab1 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -625,7 +626,7 @@ vdev_mirror_io_done(zio_t *zio) int good_copies = 0; int unexpected_errors = 0; - if (mm == NULL) + if (mm == NULL || ZIO_IS_TRIM(zio)) return; for (c = 0; c < mm->mm_children; c++) { @@ -763,51 +764,39 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vd type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; #if defined(_KERNEL) diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index d85993bff052..3faa323b3247 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* @@ -80,33 +81,32 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_trim = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 939699cb8373..f02899ec4755 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -156,6 +156,8 @@ uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; +uint32_t zfs_vdev_trim_min_active = 1; +uint32_t zfs_vdev_trim_max_active = 10; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -226,11 +228,14 @@ vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) static inline avl_tree_t * vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) { - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE); + ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || + t == ZIO_TYPE_IOCTL); if (t == ZIO_TYPE_READ) return (&vq->vq_read_offset_tree); - else + else if (t == ZIO_TYPE_WRITE) return (&vq->vq_write_offset_tree); + else + return (NULL); } int @@ -265,6 +270,9 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_removal_min_active); case ZIO_PRIORITY_INITIALIZING: return (zfs_vdev_initializing_min_active); + case ZIO_PRIORITY_AUTO_TRIM: + case ZIO_PRIORITY_MAN_TRIM: + return (zfs_vdev_trim_min_active); default: panic("invalid priority %u", p); return (0); @@ -337,6 +345,9 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: return (zfs_vdev_initializing_max_active); + case ZIO_PRIORITY_AUTO_TRIM: + case ZIO_PRIORITY_MAN_TRIM: + return (zfs_vdev_trim_max_active); default: panic("invalid priority %u", p); return (0); @@ -405,8 +416,12 @@ vdev_queue_init(vdev_t *vd) * The synchronous i/o queues are dispatched in FIFO rather * than LBA order. This provides more consistent latency for * these i/os. + * The same is true of the TRIM queue, where LBA ordering + * doesn't help. */ - if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) + if (p == ZIO_PRIORITY_SYNC_READ || + p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_AUTO_TRIM || p == ZIO_PRIORITY_MAN_TRIM) compfn = vdev_queue_timestamp_compare; else compfn = vdev_queue_offset_compare; @@ -439,7 +454,9 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt != NULL) + avl_add(qtt, zio); if (shk->kstat != NULL) { mutex_enter(&shk->lock); @@ -456,7 +473,9 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt != NULL) + avl_remove(qtt, zio); if (shk->kstat != NULL) { mutex_enter(&shk->lock); @@ -727,7 +746,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) * For LBA-ordered queues (async / scrub / initializing), issue the * i/o which follows the most recently issued i/o in LBA (offset) order. * - * For FIFO queues (sync), issue the i/o with the lowest timestamp. + * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. */ tree = vdev_queue_class_tree(vq, p); vq->vq_io_search.io_timestamp = 0; @@ -759,7 +778,11 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) } vdev_queue_pending_add(vq, zio); - vq->vq_last_offset = zio->io_offset + zio->io_size; + + /* trim I/Os have no single meaningful offset */ + if (zio->io_priority != ZIO_PRIORITY_AUTO_TRIM || + zio->io_priority != ZIO_PRIORITY_MAN_TRIM) + vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } @@ -784,13 +807,14 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; + } else { + ASSERT(ZIO_IS_TRIM(zio)); } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index d10d89f3eca7..11602a506a0d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include @@ -35,6 +36,7 @@ #include #include #include +#include #ifdef ZFS_DEBUG #include /* vdev_xlate testing */ @@ -139,6 +141,10 @@ vdev_raidz_map_free(raidz_map_t *rm) { int c; + /* raidz_map_t without abd allocation from vdev_raidz_trim() */ + if (rm->rm_col[0].rc_abd == NULL) + goto out; + for (c = 0; c < rm->rm_firstdatacol; c++) { abd_free(rm->rm_col[c].rc_abd); @@ -152,6 +158,7 @@ vdev_raidz_map_free(raidz_map_t *rm) if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); +out: kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -430,18 +437,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); ASSERT3U(rm->rm_nskip, <=, nparity); - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; + if (zio->io_abd != NULL) { + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; + off = rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = + abd_get_offset_size(zio->io_abd, off, + rm->rm_col[c].rc_size); + off += rm->rm_col[c].rc_size; + } } /* @@ -1621,6 +1631,38 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } +/* + * Converts an allocated size on a raidz vdev back to a logical block + * size. This is used in trimming to figure out the appropriate logical + * size to pass to vdev_raidz_map_alloc when splitting up extents of free + * space obtained from metaslabs. However, a range of free space on a + * raidz vdev might have originally consisted of multiple blocks and + * those, taken together with their skip blocks, might not always align + * neatly to a new vdev_raidz_map_alloc covering the entire unified + * range. So to ensure that the newly allocated raidz map *always* fits + * within the asize passed to this function and never exceeds it (since + * that might trim allocated data past it), we round it down to the + * nearest suitable multiple of the vdev ashift (hence the "_floor" in + * this function's name). + */ +static uint64_t +vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize) +{ + uint64_t psize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + psize = (asize - (nparity << ashift)); + psize /= cols; + psize *= cols - nparity; + psize += (1 << ashift) - 1; + + psize = P2ALIGN(psize, 1 << ashift); + + return (psize); +} + static void vdev_raidz_child_done(zio_t *zio) { @@ -2069,6 +2111,9 @@ vdev_raidz_io_done(zio_t *zio) int tgts[VDEV_RAIDZ_MAXPARITY]; int code; + if (ZIO_IS_TRIM(zio)) + return; + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); @@ -2397,18 +2442,111 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); } +static inline void +vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp, + const raidz_col_t *rc) +{ + uint64_t num_exts = *num_extsp; + ASSERT(rc->rc_size != 0); + + if (dfl->dfl_num_exts > 0 && + dfl->dfl_exts[num_exts - 1].dfle_start + + dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) { + dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size; + } else { + dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset; + dfl->dfl_exts[num_exts].dfle_length = rc->rc_size; + (*num_extsp)++; + } +} + +/* + * Processes a trim for a raidz vdev. Because trims deal with physical + * addresses, we can't simply pass through our logical vdev addresses to + * the underlying devices. Instead, we compute a raidz map based on the + * logical extent addresses provided to us and construct new extent + * lists that then go to each component vdev. + */ +static void +vdev_raidz_trim(vdev_t *vd, zio_t *pio, dkioc_free_list_t *dfl, + boolean_t auto_trim) +{ + dkioc_free_list_t **sub_dfls; + uint64_t *sub_dfls_num_exts; + zio_t *zio; + + sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children, + KM_SLEEP); + sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children, + KM_SLEEP); + zio = kmem_zalloc(sizeof (*zio), KM_SLEEP); + for (int i = 0; i < vd->vdev_children; i++) { + /* + * We might over-allocate here, because the sub-lists can never + * be longer than the parent list, but they can be shorter. + * The underlying driver will discard zero-length extents. + */ + sub_dfls[i] = dfl_alloc(dfl->dfl_num_exts, KM_SLEEP); + sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts; + sub_dfls[i]->dfl_flags = dfl->dfl_flags; + sub_dfls[i]->dfl_offset = dfl->dfl_offset; + /* don't copy the check func, because it isn't raidz-aware */ + } + + /* + * Process all extents and redistribute them to the component vdevs + * according to a computed raidz map geometry. + */ + for (int i = 0; i < dfl->dfl_num_exts; i++) { + uint64_t start = dfl->dfl_exts[i].dfle_start; + uint64_t length = dfl->dfl_exts[i].dfle_length; + uint64_t j; + raidz_map_t *rm; + + zio->io_offset = start; + zio->io_size = vdev_raidz_psize_floor(vd, length); + zio->io_abd = NULL; + + rm = vdev_raidz_map_alloc(zio, vd->vdev_top->vdev_ashift, + vd->vdev_children, vd->vdev_nparity); + + for (j = 0; j < rm->rm_cols; j++) { + uint64_t devidx = rm->rm_col[j].rc_devidx; + vdev_raidz_trim_append_rc(sub_dfls[devidx], + &sub_dfls_num_exts[devidx], &rm->rm_col[j]); + } + vdev_raidz_map_free(rm); + } + + /* + * Issue the component ioctls as children of the parent zio. + */ + for (int i = 0; i < vd->vdev_children; i++) { + if (sub_dfls_num_exts[i] != 0) { + vdev_t *child = vd->vdev_child[i]; + zio_nowait(zio_trim_dfl(pio, child->vdev_spa, child, + sub_dfls[i], B_TRUE, auto_trim, NULL, NULL)); + } else { + dfl_free(sub_dfls[i]); + } + } + kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children); + kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children); + kmem_free(zio, sizeof (*zio)); +} + vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - vdev_raidz_need_resilver, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_raidz_open, + .vdev_op_close = vdev_raidz_close, + .vdev_op_asize = vdev_raidz_asize, + .vdev_op_io_start = vdev_raidz_io_start, + .vdev_op_io_done = vdev_raidz_io_done, + .vdev_op_state_change = vdev_raidz_state_change, + .vdev_op_need_resilver = vdev_raidz_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_trim = vdev_raidz_trim, + .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index e40b7ce8e4e8..2bb1c0fa431a 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -140,17 +141,17 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_root_open, + .vdev_op_close = vdev_root_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = NULL, /* not applicable to the root */ + .vdev_op_io_done = NULL, /* not applicable to the root */ + .vdev_op_state_change = vdev_root_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, /* not applicable to the root */ + .vdev_op_rele = NULL, /* not applicable to the root */ + .vdev_op_xlate = NULL, /* not applicable to the root */ + .vdev_op_trim = NULL, /* not applicable to the root */ + .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0dfa016845a3..71b200da194c 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1758,6 +1758,36 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_cookie trim_cmd_info_t + */ +static int +zfs_ioc_pool_trim(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + trim_cmd_info_t tci; + + if (ddi_copyin((void *)(uintptr_t)zc->zc_cookie, &tci, + sizeof (tci), 0) == -1) + return (EFAULT); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (tci.tci_start) { + spa_man_trim(spa, tci.tci_rate); + } else { + spa_man_trim_stop(spa); + } + + spa_close(spa, FTAG); + + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -6543,6 +6573,8 @@ zfs_ioctl_init(void) zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_TRIM, + zfs_ioc_pool_trim); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 9ca53490e2c7..c797cdcc16b0 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2017 Nexenta Systems, Inc. All rights reserved. */ #include @@ -47,6 +47,8 @@ #include #include #include +#include +#include /* * ========================================================================== @@ -117,6 +119,14 @@ static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); +/* + * Tunable to allow for debugging SCSI UNMAP/SATA TRIM calls. Disabling + * it will prevent ZFS from attempting to issue DKIOCFREE ioctls to the + * underlying storage. + */ +int zfs_trim = B_TRUE; +int zfs_trim_min_ext_sz = 128 << 10; /* 128k */ + void zio_init(void) { @@ -844,11 +854,25 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, static void zio_destroy(zio_t *zio) { + if (ZIO_IS_TRIM(zio)) { + vdev_t *vd = zio->io_vd; + ASSERT(vd != NULL); + ASSERT(!MUTEX_HELD(&vd->vdev_trim_zios_lock)); + mutex_enter(&vd->vdev_trim_zios_lock); + ASSERT(vd->vdev_trim_zios != 0); + vd->vdev_trim_zios--; + cv_broadcast(&vd->vdev_trim_zios_cv); + mutex_exit(&vd->vdev_trim_zios_lock); + } metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); + if (zio->io_dfl != NULL && zio->io_dfl_free_on_destroy) + dfl_free(zio->io_dfl); + else + ASSERT0(zio->io_dfl_free_on_destroy); kmem_cache_free(zio_cache, zio); } @@ -1210,6 +1234,174 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, return (zio); } +/* + * Performs the same function as zio_trim_tree, but takes a dkioc_free_list_t + * instead of a range tree of extents. The `dfl' argument is stored in the + * zio and shouldn't be altered by the caller after calling zio_trim_dfl. + * If `dfl_free_on_destroy' is true, the zio will destroy and free the list + * using dfl_free after the zio is done executing. + */ +zio_t * +zio_trim_dfl(zio_t *pio, spa_t *spa, vdev_t *vd, dkioc_free_list_t *dfl, + boolean_t dfl_free_on_destroy, boolean_t auto_trim, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + int c; + + ASSERT(dfl->dfl_num_exts != 0); + + if (vd->vdev_ops->vdev_op_leaf) { + /* + * A trim zio is a special ioctl zio that can enter the vdev + * queue. We don't want to be sorted in the queue by offset, + * but sometimes the queue requires that, so we fake an + * offset value. We simply use the offset of the first extent + * and the minimum allocation unit on the vdev to keep the + * queue's algorithms working more-or-less as they should. + */ + uint64_t off = dfl->dfl_exts[0].dfle_start; + + zio = zio_create(pio, spa, 0, NULL, NULL, 1 << vd->vdev_ashift, + 1 << vd->vdev_ashift, done, private, ZIO_TYPE_IOCTL, + auto_trim ? ZIO_PRIORITY_AUTO_TRIM : ZIO_PRIORITY_MAN_TRIM, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_AGGREGATE, vd, off, + NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); + zio->io_cmd = DKIOCFREE; + zio->io_dfl = dfl; + zio->io_dfl_free_on_destroy = dfl_free_on_destroy; + + mutex_enter(&vd->vdev_trim_zios_lock); + vd->vdev_trim_zios++; + mutex_exit(&vd->vdev_trim_zios_lock); + } else { + /* + * Trims to non-leaf vdevs have two possible paths. For vdevs + * that do not provide a specific trim fanout handler, we + * simply duplicate the trim to each child. vdevs which do + * have a trim fanout handler are responsible for doing the + * fanout themselves. + */ + zio = zio_null(pio, spa, vd, done, private, 0); + zio->io_dfl = dfl; + zio->io_dfl_free_on_destroy = dfl_free_on_destroy; + + if (vd->vdev_ops->vdev_op_trim != NULL) { + vd->vdev_ops->vdev_op_trim(vd, zio, dfl, auto_trim); + } else { + for (c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_trim_dfl(zio, spa, + vd->vdev_child[c], dfl, B_FALSE, auto_trim, + NULL, NULL)); + } + } + } + + return (zio); +} + +/* + * This check is used by zio_trim_tree to set in dfl_ck_func to help debugging + * extent trimming. If the SCSI driver (sd) was compiled with the DEBUG flag + * set, dfl_ck_func is called for every extent to verify that it is indeed + * ok to be trimmed. This function compares the extent address with the tree + * of free blocks (ms_tree) in the metaslab which this trim was originally + * part of. + */ +static void +zio_trim_check(uint64_t start, uint64_t len, void *msp) +{ + metaslab_t *ms = msp; + boolean_t held = MUTEX_HELD(&ms->ms_lock); + if (!held) + mutex_enter(&ms->ms_lock); + ASSERT(ms->ms_trimming_ts != NULL); + if (ms->ms_loaded) + ASSERT(range_tree_contains(ms->ms_trimming_ts->ts_tree, + start - VDEV_LABEL_START_SIZE, len)); + if (!held) + mutex_exit(&ms->ms_lock); +} + +/* + * Takes a bunch of freed extents and tells the underlying vdevs that the + * space associated with these extents can be released. + * This is used by flash storage to pre-erase blocks for rapid reuse later + * and thin-provisioned block storage to reclaim unused blocks. + * This function is actually a front-end to zio_trim_dfl. It simply converts + * the provided range_tree's contents into a dkioc_free_list_t and calls + * zio_trim_dfl with it. The `tree' argument is not used after this function + * returns and can be discarded by the caller. + */ +zio_t * +zio_trim_tree(zio_t *pio, spa_t *spa, vdev_t *vd, struct range_tree *tree, + boolean_t auto_trim, zio_done_func_t *done, void *private, + int dkiocfree_flags, metaslab_t *msp) +{ + dkioc_free_list_t *dfl = NULL; + range_seg_t *rs; + uint64_t rs_idx; + uint64_t num_exts; + uint64_t bytes_issued = 0, bytes_skipped = 0, exts_skipped = 0; + + ASSERT(range_tree_space(tree) != 0); + + num_exts = avl_numnodes(&tree->rt_root); + dfl = dfl_alloc(num_exts, KM_SLEEP); + dfl->dfl_flags = dkiocfree_flags; + dfl->dfl_num_exts = num_exts; + dfl->dfl_offset = VDEV_LABEL_START_SIZE; + if (msp) { + dfl->dfl_ck_func = zio_trim_check; + dfl->dfl_ck_arg = msp; + } + + for (rs = avl_first(&tree->rt_root), rs_idx = 0; rs != NULL; + rs = AVL_NEXT(&tree->rt_root, rs)) { + uint64_t len = rs->rs_end - rs->rs_start; + + /* Skip extents that are too short to bother with. */ + if (len < zfs_trim_min_ext_sz) { + bytes_skipped += len; + exts_skipped++; + continue; + } + + dfl->dfl_exts[rs_idx].dfle_start = rs->rs_start; + dfl->dfl_exts[rs_idx].dfle_length = len; + + /* check we're a multiple of the vdev ashift */ + ASSERT0(dfl->dfl_exts[rs_idx].dfle_start & + ((1 << vd->vdev_ashift) - 1)); + ASSERT0(dfl->dfl_exts[rs_idx].dfle_length & + ((1 << vd->vdev_ashift) - 1)); + + rs_idx++; + bytes_issued += len; + } + + spa_trimstats_update(spa, rs_idx, bytes_issued, exts_skipped, + bytes_skipped); + + /* the zfs_trim_min_ext_sz filter may have shortened the list */ + if (dfl->dfl_num_exts != rs_idx) { + if (rs_idx == 0) { + /* Removing short extents has removed all extents. */ + dfl_free(dfl); + return (zio_null(pio, spa, vd, done, private, 0)); + } + dkioc_free_list_t *dfl2 = dfl_alloc(rs_idx, KM_SLEEP); + bcopy(dfl, dfl2, DFL_SZ(rs_idx)); + dfl2->dfl_num_exts = rs_idx; + dfl_free(dfl); + dfl = dfl2; + } + + return (zio_trim_dfl(pio, spa, vd, dfl, B_TRUE, auto_trim, done, + private)); +} + zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, @@ -3560,6 +3752,30 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * ========================================================================== */ +/* + * Late pipeline bypass for trim zios. Because our zio trim queues can be + * pretty long and we might want to quickly terminate trims for performance + * reasons, we check the following conditions: + * 1) If a manual trim was initiated with the queue full of auto trim zios, + * we want to skip doing the auto trims, because they hold up the manual + * trim unnecessarily. Manual trim processes all empty space anyway. + * 2) If the autotrim property of the pool is flipped to off, usually due to + * performance reasons, we want to stop trying to do autotrims/ + * 3) If a manual trim shutdown was requested, immediately terminate them. + * 4) If a pool vdev reconfiguration is imminent, we must discard all queued + * up trims to let it proceed as quickly as possible. + */ +static inline boolean_t +zio_trim_should_bypass(const zio_t *zio) +{ + ASSERT(ZIO_IS_TRIM(zio)); + return ((zio->io_priority == ZIO_PRIORITY_AUTO_TRIM && + (zio->io_vd->vdev_top->vdev_man_trimming || + zio->io_spa->spa_auto_trim != SPA_AUTO_TRIM_ON)) || + (zio->io_priority == ZIO_PRIORITY_MAN_TRIM && + zio->io_spa->spa_man_trim_stop) || + zio->io_vd->vdev_trim_zios_stop); +} /* * Issue an I/O to the underlying vdev. Typically the issue pipeline @@ -3684,7 +3900,8 @@ zio_vdev_io_start(zio_t *zio) } if (vd->vdev_ops->vdev_op_leaf && - (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || + ZIO_IS_TRIM(zio))) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); @@ -3700,6 +3917,9 @@ zio_vdev_io_start(zio_t *zio) zio->io_delay = gethrtime(); } + if (ZIO_IS_TRIM(zio) && zio_trim_should_bypass(zio)) + return (ZIO_PIPELINE_CONTINUE); + vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } @@ -3715,7 +3935,8 @@ zio_vdev_io_done(zio_t *zio) return (NULL); } - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || ZIO_IS_TRIM(zio)); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; @@ -3734,7 +3955,7 @@ zio_vdev_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); - if (zio->io_error) { + if (zio->io_error && !ZIO_IS_TRIM(zio)) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { @@ -4865,4 +5086,12 @@ MODULE_PARM_DESC(zfs_sync_pass_rewrite, module_param(zio_dva_throttle_enabled, int, 0644); MODULE_PARM_DESC(zio_dva_throttle_enabled, "Throttle block allocations in the ZIO pipeline"); + +module_param(zfs_trim, int, 0644); +MODULE_PARM_DESC(zfs_trim, + "Enable TRIM"); + +module_param(zfs_trim_min_ext_sz, int, 0644); +MODULE_PARM_DESC(zfs_trim_min_ext_sz, + "Minimum size to TRIM"); #endif diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 7395dcb8ddcb..02e2b34eb9af 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -34,7 +34,7 @@ * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index d3ecf6274770..72e6ea2f7a8b 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -836,6 +836,9 @@ tags = ['functional', 'threadsappend'] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos'] tags = ['functional', 'tmpfile'] +[tests/functional/trim] +tests = ['autotrim_001_pos', 'manualtrim_001_pos'] + [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index 90f5e1821318..da27673ec946 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -68,6 +68,7 @@ SUBDIRS = \ sparse \ threadsappend \ tmpfile \ + trim \ truncate \ upgrade \ user_namespace \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 48a32174fa36..db7f67f3c290 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -57,6 +57,8 @@ typeset -a properties=( "fragmentation" "leaked" "multihost" + "forcetrim" + "autotrim" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am new file mode 100644 index 000000000000..a379bf898fd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/trim +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + trim.cfg \ + trim.kshlib \ + cleanup.ksh \ + autotrim_001_pos.ksh \ + manualtrim_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh new file mode 100755 index 000000000000..fc74bb7bf570 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh @@ -0,0 +1,114 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +set_tunable zfs_trim_min_ext_sz 4096 +set_tunable zfs_txgs_per_trim 2 + +function getsizemb +{ + typeset rval + + rval=$(du --block-size 1048576 -s "$1" | sed -e 's;[ ].*;;') + echo -n "$rval" +} + +function checkvdevs +{ + typeset vd sz + + for vd in $VDEVS; do + sz=$(getsizemb $vd) + log_note Size of $vd is $sz MB + log_must test $sz -le $SHRUNK_SIZE_MB + done +} + +function txgs +{ + typeset x + + # Run some txgs in order to let autotrim do its work. + # + for x in 1 2 3; do + log_must zfs snapshot $TRIMPOOL@snap + log_must zfs destroy $TRIMPOOL@snap + log_must zfs snapshot $TRIMPOOL@snap + log_must zfs destroy $TRIMPOOL@snap + done +} + +# +# Check various pool geometries: Create the pool, fill it, remove the test file, +# run some txgs, export the pool and verify that the vdevs shrunk. +# + +# +# raidz +# +for z in 1 2 3; do + setupvdevs + log_must zpool create -f $TRIMPOOL raidz$z $VDEVS + log_must zpool set autotrim=on $TRIMPOOL + log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w + log_must rm "/$TRIMPOOL/$TESTFILE" + txgs + log_must zpool export $TRIMPOOL + checkvdevs +done + +# +# mirror +# +setupvdevs +log_must zpool create -f $TRIMPOOL mirror $MIRROR_VDEVS_1 mirror $MIRROR_VDEVS_2 +log_must zpool set autotrim=on $TRIMPOOL +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w +log_must rm "/$TRIMPOOL/$TESTFILE" +txgs +log_must zpool export $TRIMPOOL +checkvdevs + +# +# stripe +# +setupvdevs +log_must zpool create -f $TRIMPOOL $STRIPE_VDEVS +log_must zpool set autotrim=on $TRIMPOOL +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w +log_must rm "/$TRIMPOOL/$TESTFILE" +txgs +log_must zpool export $TRIMPOOL +checkvdevs + +log_pass TRIM successfully shrunk vdevs diff --git a/tests/zfs-tests/tests/functional/trim/cleanup.ksh b/tests/zfs-tests/tests/functional/trim/cleanup.ksh new file mode 100755 index 000000000000..e8d1515e660a --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg + +rm -f $VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh new file mode 100755 index 000000000000..7603a85cfd26 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh @@ -0,0 +1,100 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +set_tunable zfs_trim_min_ext_sz 4096 + +function getsizemb +{ + typeset rval + + rval=$(du --block-size 1048576 -s "$1" | sed -e 's;[ ].*;;') + echo -n "$rval" +} + +function checkvdevs +{ + typeset vd sz + + for vd in $VDEVS; do + sz=$(getsizemb $vd) + log_note Size of $vd is $sz MB + log_must test $sz -le $SHRUNK_SIZE_MB + done +} + +function dotrim +{ + log_must rm "/$TRIMPOOL/$TESTFILE" + log_must zpool export $TRIMPOOL + log_must zpool import -d $VDEVDIR $TRIMPOOL + log_must zpool trim $TRIMPOOL + sleep 5 + log_must zpool export $TRIMPOOL +} + +# +# Check various pool geometries: Create the pool, fill it, remove the test file, +# perform a manual trim, export the pool and verify that the vdevs shrunk. +# + +# +# raidz +# +for z in 1 2 3; do + setupvdevs + log_must zpool create -f $TRIMPOOL raidz$z $VDEVS + log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w + dotrim + checkvdevs +done + +# +# mirror +# +setupvdevs +log_must zpool create -f $TRIMPOOL mirror $MIRROR_VDEVS_1 mirror $MIRROR_VDEVS_2 +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w +dotrim +checkvdevs + +# +# stripe +# +setupvdevs +log_must zpool create -f $TRIMPOOL $STRIPE_VDEVS +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w +dotrim +checkvdevs + +log_pass Manual TRIM successfully shrunk vdevs diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh new file mode 100755 index 000000000000..feb9ef2ed7ea --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +log_pass TRIM setup succeeded diff --git a/tests/zfs-tests/tests/functional/trim/trim.cfg b/tests/zfs-tests/tests/functional/trim/trim.cfg new file mode 100644 index 000000000000..ab7e2291d074 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/trim.cfg @@ -0,0 +1,60 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +# +# Parameters +# +TRIMPOOL=trimpool +VDEVDIR="/tmp" +VDEVS="/tmp/trim1.dev /tmp/trim2.dev /tmp/trim3.dev /tmp/trim4.dev /tmp/trim5.dev" +VDEV_SIZE=128m +TESTFILE=testfile +SHRUNK_SIZE_MB=20 + +NUM_WRITES=2048 +BLOCKSIZE=65536 + +# +# Computed values and parameters +# +function get_mirror_vdevs +{ + set -- $VDEVS + MIRROR_VDEVS_1="$1 $2" + MIRROR_VDEVS_2="$3 $4" +} +get_mirror_vdevs + +function get_stripe_vdevs +{ + set -- $VDEVS + STRIPE_VDEVS="$1 $2 $3 $4" +} +get_stripe_vdevs diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib new file mode 100644 index 000000000000..041c1f0754b7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -0,0 +1,35 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +function set_tunable +{ + typeset tunable="$1" + typeset value="$2" + typeset zfs_tunables="/sys/module/zfs/parameters" + + [[ -z "$tunable" ]] && return 1 + [[ -z "$value" ]] && return 1 + [[ -f "$zfs_tunables/$tunable" ]] || return 1 + + echo -n "$value" > "$zfs_tunables/$tunable" + return "$?" +} + +function find_scsi_debug +{ + grep -H scsi_debug /sys/block/*/device/model | $AWK -F/ '{print $4}' | tr '\n' ' ' +} + +function setupvdevs +{ + log_must rm -f $VDEVS + log_must truncate -s 192m $VDEVS +} From 1748965bc7d7a2d2f413b4efaeca45096809087d Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Sat, 15 Apr 2017 02:48:16 +0200 Subject: [PATCH 03/38] Trimming an offlined vdev asserts in zio_create. Requires-builders: none --- module/zfs/zio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c797cdcc16b0..3b27bb64dd9b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1251,7 +1251,12 @@ zio_trim_dfl(zio_t *pio, spa_t *spa, vdev_t *vd, dkioc_free_list_t *dfl, ASSERT(dfl->dfl_num_exts != 0); - if (vd->vdev_ops->vdev_op_leaf) { + if (!vdev_writeable(vd)) { + /* Skip unavailable vdevs, just create a dummy zio. */ + zio = zio_null(pio, spa, vd, done, private, 0); + zio->io_dfl = dfl; + zio->io_dfl_free_on_destroy = dfl_free_on_destroy; + } else if (vd->vdev_ops->vdev_op_leaf) { /* * A trim zio is a special ioctl zio that can enter the vdev * queue. We don't want to be sorted in the queue by offset, From 6f7dc83c0d17fa6ee0590eb6594c7bcb4508c4c0 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sat, 15 Apr 2017 10:59:49 -0500 Subject: [PATCH 04/38] Want extended zpool iostat trim support The extended zpool iostat options -wlqr will display information about automatic and manual TRIMs. This commit also fixes a completely unrelated bug in which the IOS_LATENCY row in the vsx_type_to_nvlist array was missing an entry for the scrub nvlist. Requires-builders: none --- cmd/zpool/zpool_main.c | 40 +++++++++++++++++++++++++++++++--------- include/sys/fs/zfs.h | 14 ++++++++++++++ module/zfs/vdev_label.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 9 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 128fbc1773ee..3492bddb7689 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -195,7 +195,7 @@ enum iostat_type { * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ -static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { +static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, @@ -206,12 +206,17 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_MAN_TRIM_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_MAN_TRIM_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, @@ -219,6 +224,8 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_MAN_TRIM_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, @@ -231,6 +238,10 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_IND_AUTO_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_IND_MAN_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO, NULL}, }; @@ -3366,21 +3377,22 @@ typedef struct name_and_columns { unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; -#define IOSTAT_MAX_LABELS 11 /* Max number of labels on one line */ +#define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, - {"asyncq_wait", 2}, {"scrub"}}, + {"asyncq_wait", 2}, {"scrub"}, {"atrim"}, {"mtrim"}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, - {NULL}}, + {"auto_trimq", 2}, {"man_trimq", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"sync_queue", 2}, {"async_queue", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, - {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {NULL}}, + {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, + {"auto_trim", 2}, {"man_trim", 2}, {NULL}}, }; @@ -3390,13 +3402,17 @@ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, - {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}}, + {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, + {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, - {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, + {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, + {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, - {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}}, + {"write"}, {"read"}, {"write"}, {"scrub"}, {"atrim"}, + {"mtrim"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, - {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, + {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, + {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { @@ -4020,6 +4036,10 @@ print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_MAN_TRIM_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_MAN_TRIM_ACTIVE_QUEUE, }; struct stat_array *nva; @@ -4058,6 +4078,8 @@ print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_AUTO_TRIM_LAT_HISTO, + ZPOOL_CONFIG_VDEV_MAN_TRIM_LAT_HISTO, }; struct stat_array *nva; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4a4ed441ac5f..f6b79b6f7ff6 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -636,6 +636,10 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" +#define ZPOOL_CONFIG_VDEV_AUTO_TRIM_ACTIVE_QUEUE \ + "vdev_async_auto_trim_active_queue" +#define ZPOOL_CONFIG_VDEV_MAN_TRIM_ACTIVE_QUEUE \ + "vdev_async_man_trim_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" @@ -643,6 +647,10 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" +#define ZPOOL_CONFIG_VDEV_AUTO_TRIM_PEND_QUEUE \ + "vdev_async_auto_trim_pend_queue" +#define ZPOOL_CONFIG_VDEV_MAN_TRIM_PEND_QUEUE \ + "vdev_async_man_trim_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" @@ -654,6 +662,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" +#define ZPOOL_CONFIG_VDEV_AUTO_TRIM_LAT_HISTO "vdev_auto_trim_histo" +#define ZPOOL_CONFIG_VDEV_MAN_TRIM_LAT_HISTO "vdev_man_trim_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" @@ -666,6 +676,10 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" +#define ZPOOL_CONFIG_VDEV_IND_AUTO_TRIM_HISTO "vdev_ind_auto_trim_histo" +#define ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO "vdev_agg_auto_trim_histo" +#define ZPOOL_CONFIG_VDEV_IND_MAN_TRIM_HISTO "vdev_ind_man_trim_histo" +#define ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO "vdev_agg_man_trim_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 201b6ca01dd4..64d2e37bdc46 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -251,6 +251,12 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_AUTO_TRIM_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_AUTO_TRIM]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_MAN_TRIM_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_MAN_TRIM]); + /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); @@ -267,6 +273,12 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_AUTO_TRIM_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_AUTO_TRIM]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_MAN_TRIM_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_MAN_TRIM]); + /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], @@ -304,6 +316,14 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AUTO_TRIM_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_AUTO_TRIM], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_AUTO_TRIM])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_MAN_TRIM_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_MAN_TRIM], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_MAN_TRIM])); + /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], @@ -325,6 +345,14 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_AUTO_TRIM_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_AUTO_TRIM], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_AUTO_TRIM])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_MAN_TRIM_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_MAN_TRIM], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_MAN_TRIM])); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); @@ -348,6 +376,14 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_MAN_TRIM])); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); From 0f0c1030ec80725919d9c80188356cf730824490 Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Wed, 19 Apr 2017 16:51:30 +0200 Subject: [PATCH 05/38] Matt Ahrens' review comments, round 2. Brian Behlendorf's review comments. Requires-builders: none --- module/zfs/metaslab.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 4dbb08ac7db0..a8dd20c9c02f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -249,6 +249,14 @@ kmem_cache_t *metaslab_alloc_trace_cache; * recovery (extents won't get trimmed immediately, but instead only * after passing this rather long timeout, thus preserving * 'zfs import -F' functionality). + * The exact default value of this tunable is a tradeoff between: + * 1) Keeping the trim commands reasonably small. + * 2) Keeping the ability to rollback back for as many txgs as possible. + * 3) Waiting around too long that the user starts to get uneasy about not + * seeing any space being freed after they remove some files. + * The default value of 32 is the maximum number of uberblocks in a vdev + * label, assuming a 4k physical sector size (which seems to be the almost + * universal smallest sector size used in SSDs). */ unsigned int zfs_txgs_per_trim = 32; /* @@ -2740,8 +2748,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * just emptied out the defer_tree. */ if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON && - !vd->vdev_man_trimming) + !vd->vdev_man_trimming) { range_tree_walk(*defer_tree, metaslab_trim_add, msp); + if (!defer_allowed) { + range_tree_walk(msp->ms_freedtree, metaslab_trim_add, + msp); + } + } range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { @@ -4498,6 +4511,8 @@ metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size) range_tree_clear(msp->ms_cur_ts->ts_tree, offset, size); if (msp->ms_prev_ts != NULL) range_tree_clear(msp->ms_prev_ts->ts_tree, offset, size); + ASSERT(msp->ms_trimming_ts == NULL || + !range_tree_contains(msp->ms_trimming_ts->ts_tree, offset, size)); } /* @@ -4518,8 +4533,7 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) } /* - * Does a metaslab's automatic trim operation processing. This must be - * called from metaslab_sync, with the txg number of the txg. This function + * Does a metaslab's automatic trim operation processing. This function * issues trims in intervals as dictated by the zfs_txgs_per_trim tunable. * If the previous trimset has not yet finished trimming, this function * decides what to do based on `preserve_spilled'. If preserve_spilled is @@ -4670,9 +4684,13 @@ metaslab_trim_done(zio_t *zio) * until that trim completes. * The `auto_trim' argument signals whether the trim is being invoked on * behalf of auto or manual trim. The differences are: - * 1) For auto trim the trimset is split up into zios of no more than - * zfs_max_bytes_per_trim bytes. Manual trim already does this - * earlier, so the whole trimset is issued in a single zio. + * 1) For auto trim the trimset is split up into subtrees, each containing no + * more than zfs_max_bytes_per_trim total bytes. Each subtree is then + * trimmed in one zio. This is done to limit the number of LBAs per + * trim command, as many devices perform suboptimally with large trim + * commands, even if they indicate support for them. Manual trim already + * applies this limit earlier by limiting the trimset size, so the + * whole trimset can be issued in a single zio. * 2) The zio(s) generated are tagged with either ZIO_PRIORITY_AUTO_TRIM or * ZIO_PRIORITY_MAN_TRIM to allow differentiating them further down * the pipeline (see zio_priority_t in sys/zio_priority.h). From 3ccb6dd9d05c2a40b1a9685ad125265a2992f1f5 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 18 Apr 2017 19:50:13 -0400 Subject: [PATCH 06/38] Async TRIM, Extended Stats The blkdev_issue_discard() function has been available for a long time by the kernel but it only supports synchronous discards. The __blkdev_issue_discard() function provides an asynchronous interface but was added in the 4.6 kernel. Only supporting synchronously discards can potentially limit performance when processing a large number of small extents. To avoid this an asynchronous discard implementation has been added to vdev_disk.c which builds on existing functionality. The kernel provided synchronous version remains the default pending additional functional and performance testing. Due to different mechamism used for submitting TRIM commands there were not being properly accounted for in the extended statistics. Resolve this by allow for aggregated stats to be returned as part of the TRIM zio. This allows for far better visibility in to the discard request sizes. Minor documentation updates. Signed-off-by: Brian Behlendorf Requires-builders: none --- cmd/zpool/zpool_main.c | 9 +- include/linux/blkdev_compat.h | 34 +++++ include/sys/fs/zfs.h | 25 +++- include/sys/vdev.h | 11 ++ include/sys/zio.h | 2 + man/man8/zpool.8 | 3 - module/zfs/vdev.c | 60 ++++++++- module/zfs/vdev_disk.c | 229 +++++++++++++++++++++++++++------- module/zfs/vdev_file.c | 7 ++ module/zfs/zio.c | 27 ++-- 10 files changed, 332 insertions(+), 75 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3492bddb7689..19cb8f682e01 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -195,7 +195,7 @@ enum iostat_type { * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ -static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { +static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, @@ -239,9 +239,7 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_AUTO_TRIM_HISTO, - ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO, ZPOOL_CONFIG_VDEV_IND_MAN_TRIM_HISTO, - ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO, NULL}, }; @@ -3392,7 +3390,7 @@ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = {"sync_queue", 2}, {"async_queue", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, - {"auto_trim", 2}, {"man_trim", 2}, {NULL}}, + {"trim", 2}, {NULL}}, }; @@ -3411,8 +3409,7 @@ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = {"write"}, {"read"}, {"write"}, {"scrub"}, {"atrim"}, {"mtrim"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, - {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, - {"agg"}, {NULL}}, + {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"auto"}, {"man"}, {NULL}}, }; static const char *histo_to_title[] = { diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index 274552d5dc48..14633447bf1f 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -539,6 +539,40 @@ bio_is_fua(struct bio *bio) #endif } +/* + * bio_set_discard - Set the appropriate flags in a bio to indicate + * that the specific random of sectors should be discarded. + * + * 4.8 - 4.x API, + * REQ_OP_DISCARD + * + * 2.6.36 - 4.7 API, + * REQ_DISCARD + * + * 2.6.28 - 2.6.35 API, + * BIO_RW_DISCARD + * + * In all cases the normal I/O path is used for discards. The only + * difference is how the kernel tags individual I/Os as discards. + * + * Note that 2.6.32 era kernels provide both BIO_RW_DISCARD and REQ_DISCARD, + * where BIO_RW_DISCARD is the correct interface. Therefore, it is important + * that the HAVE_BIO_RW_DISCARD check occur before the REQ_DISCARD check. + */ +static inline void +bio_set_discard(struct bio *bio) +{ +#if defined(HAVE_REQ_OP_DISCARD) + bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); +#elif defined(HAVE_BIO_RW_DISCARD) + bio_set_op_attrs(bio, (1 << BIO_RW_DISCARD), 0); +#elif defined(REQ_DISCARD) + bio_set_op_attrs(bio, REQ_WRITE | REQ_DISCARD, 0); +#else +#error "Allowing the build will cause discard requests to become writes." +#endif +} + /* * 4.8 - 4.x API, * REQ_OP_DISCARD diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f6b79b6f7ff6..6d01882352f6 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -637,9 +637,9 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_AUTO_TRIM_ACTIVE_QUEUE \ - "vdev_async_auto_trim_active_queue" + "vdev_async_auto_trim_active_queue" #define ZPOOL_CONFIG_VDEV_MAN_TRIM_ACTIVE_QUEUE \ - "vdev_async_man_trim_active_queue" + "vdev_async_man_trim_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" @@ -648,9 +648,9 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_AUTO_TRIM_PEND_QUEUE \ - "vdev_async_auto_trim_pend_queue" + "vdev_async_auto_trim_pend_queue" #define ZPOOL_CONFIG_VDEV_MAN_TRIM_PEND_QUEUE \ - "vdev_async_man_trim_pend_queue" + "vdev_async_man_trim_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" @@ -677,9 +677,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_AUTO_TRIM_HISTO "vdev_ind_auto_trim_histo" -#define ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO "vdev_agg_auto_trim_histo" #define ZPOOL_CONFIG_VDEV_IND_MAN_TRIM_HISTO "vdev_ind_man_trim_histo" -#define ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO "vdev_agg_man_trim_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" @@ -1099,6 +1097,21 @@ typedef enum pool_initialize_func { POOL_INITIALIZE_FUNCS } pool_initialize_func_t; +/* + * Discard stats + * + * Aggregate statistics for all discards issued as part of a zio TRIM. + * They are merged with standard and extended stats when the zio is done. + */ +typedef struct vdev_stat_trim { + uint64_t vsd_ops; + uint64_t vsd_bytes; + uint64_t vsd_ind_histo[VDEV_RQ_HISTO_BUCKETS]; + uint64_t vsd_queue_histo[VDEV_L_HISTO_BUCKETS]; + uint64_t vsd_disk_histo[VDEV_L_HISTO_BUCKETS]; + uint64_t vsd_total_histo[VDEV_L_HISTO_BUCKETS]; +} vdev_stat_trim_t; + /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 3e4307d6fdf3..9b09a2242706 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -54,6 +54,15 @@ typedef struct vdev_trim_info { void *vti_done_arg; } vdev_trim_info_t; +typedef enum vdev_trim_stat_flags +{ + TRIM_STAT_OP = 1 << 0, + TRIM_STAT_RQ_HISTO = 1 << 1, + TRIM_STAT_L_HISTO = 1 << 2, +} vdev_trim_stat_flags_t; + +#define TRIM_STAT_ALL (TRIM_STAT_OP | TRIM_STAT_RQ_HISTO | TRIM_STAT_L_HISTO) + extern int zfs_nocacheflush; extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); @@ -108,6 +117,8 @@ extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); +extern void vdev_trim_stat_update(zio_t *zio, uint64_t psize, + vdev_trim_stat_flags_t flags); extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, diff --git a/include/sys/zio.h b/include/sys/zio.h index f3989c2086bb..f0f0178ff4e6 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -284,6 +284,7 @@ typedef void zio_done_func_t(zio_t *zio); extern int zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; extern int zfs_trim; +extern int zfs_trim_sync; struct range_tree; @@ -476,6 +477,7 @@ struct zio { /* Used by trim zios */ dkioc_free_list_t *io_dfl; + vdev_stat_trim_t *io_dfl_stats; boolean_t io_dfl_free_on_destroy; /* Stuff for the vdev stack */ diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 6eabf49d9a97..8f35ca0ee7ce 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -195,9 +195,6 @@ .Cm sync .Oo Ar pool Oc Ns ... .Nm -.Cm trim -.Oo Fl pr Ar pool -.Nm .Cm upgrade .Nm .Cm upgrade diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 58e60ef003e3..22780cbfdb2a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4044,13 +4044,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_self_healed += psize; } + if ((!vd->vdev_ops->vdev_op_leaf) || + (zio->io_priority >= ZIO_PRIORITY_NUM_QUEUEABLE)) { + mutex_exit(&vd->vdev_stat_lock); + return; + } + /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). + * Successful TRIM zios include aggregate statistics for all + * discards which resulted from the single TRIM zio. */ - if (vd->vdev_ops->vdev_op_leaf && - (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { - + if (!ZIO_IS_TRIM(zio)) { vs->vs_ops[type]++; vs->vs_bytes[type] += psize; @@ -4070,6 +4076,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } + } else if (zio->io_dfl_stats != NULL) { + vdev_stat_trim_t *vsd = zio->io_dfl_stats; + + vs->vs_ops[type] += vsd->vsd_ops; + vs->vs_bytes[type] += vsd->vsd_bytes; + + for (int i = 0; i < VDEV_RQ_HISTO_BUCKETS; i++) + vsx->vsx_ind_histo[zio->io_priority][i] += + vsd->vsd_ind_histo[i]; + + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + vsx->vsx_queue_histo[zio->io_priority][i] += + vsd->vsd_queue_histo[i]; + vsx->vsx_disk_histo[type][i] += + vsd->vsd_disk_histo[i]; + vsx->vsx_total_histo[type][i] += + vsd->vsd_total_histo[i]; + } } mutex_exit(&vd->vdev_stat_lock); @@ -4161,7 +4185,35 @@ vdev_deflated_space(vdev_t *vd, int64_t space) } /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the aggregate statistics for a TRIM zio. + */ +void +vdev_trim_stat_update(zio_t *zio, uint64_t psize, vdev_trim_stat_flags_t flags) +{ + vdev_stat_trim_t *vsd = zio->io_dfl_stats; + hrtime_t now = gethrtime(); + hrtime_t io_delta = io_delta = now - zio->io_timestamp; + hrtime_t io_delay = now - zio->io_delay; + + if (flags & TRIM_STAT_OP) { + vsd->vsd_ops++; + vsd->vsd_bytes += psize; + } + + if (flags & TRIM_STAT_RQ_HISTO) { + vsd->vsd_ind_histo[RQ_HISTO(psize)]++; + } + + if (flags & TRIM_STAT_L_HISTO) { + vsd->vsd_queue_histo[L_HISTO(io_delta - io_delay)]++; + vsd->vsd_disk_histo[L_HISTO(io_delay)]++; + vsd->vsd_total_histo[L_HISTO(io_delta)]++; + } +} + +/* + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 3cbb5b956c68..10060b9c2e6f 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -379,14 +379,13 @@ vdev_disk_dio_alloc(int bio_count) dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); - if (dr) { - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - for (i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - } + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; return (dr); } @@ -438,6 +437,25 @@ vdev_disk_dio_put(dio_request_t *dr) return (rc); } +#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) +static void +vdev_disk_dio_blk_start_plug(dio_request_t *dr, struct blk_plug *plug) +{ + if (dr->dr_bio_count > 1) + blk_start_plug(plug); +} + +static void +vdev_disk_dio_blk_finish_plug(dio_request_t *dr, struct blk_plug *plug) +{ + if (dr->dr_bio_count > 1) + blk_finish_plug(plug); +} +#else +#define vdev_disk_dio_blk_start_plug(dr, plug) ((void)0) +#define vdev_disk_dio_blk_finish_plug(dr, plug) ((void)0) +#endif /* HAVE_BLK_QUEUE_HAVE_BLK_PLUG */ + BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) { dio_request_t *dr = bio->bi_private; @@ -629,22 +647,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); - -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) - if (dr->dr_bio_count > 1) - blk_start_plug(&plug); -#endif + vdev_disk_dio_blk_start_plug(dr, &plug); /* Submit all bio's associated with this dio */ for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) - if (dr->dr_bio_count > 1) - blk_finish_plug(&plug); -#endif - + vdev_disk_dio_blk_finish_plug(dr, &plug); (void) vdev_disk_dio_put(dr); return (error); @@ -694,6 +704,151 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +static int +vdev_disk_io_discard_sync(struct block_device *bdev, zio_t *zio) +{ + dkioc_free_list_t *dfl = zio->io_dfl; + + zio->io_dfl_stats = kmem_zalloc(sizeof (vdev_stat_trim_t), KM_SLEEP); + + for (int i = 0; i < dfl->dfl_num_exts; i++) { + int error; + + if (dfl->dfl_exts[i].dfle_length == 0) + continue; + + error = -blkdev_issue_discard(bdev, + (dfl->dfl_exts[i].dfle_start + dfl->dfl_offset) >> 9, + dfl->dfl_exts[i].dfle_length >> 9, GFP_NOFS, 0); + if (error != 0) { + return (SET_ERROR(error)); + } else { + vdev_trim_stat_update(zio, + dfl->dfl_exts[i].dfle_length, TRIM_STAT_ALL); + } + } + + return (0); +} + +BIO_END_IO_PROTO(vdev_disk_io_discard_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + zio_t *zio = dr->dr_zio; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* + * Only the latency is updated at completion. The ops and request + * size must be update when submitted since the size is no longer + * available as part of the bio. + */ + vdev_trim_stat_update(zio, 0, TRIM_STAT_L_HISTO); + + /* Drop reference acquired by vdev_disk_io_discard() */ + (void) vdev_disk_dio_put(dr); +} + +/* + * zio->io_dfl contains a dkioc_free_list_t specifying which offsets are to + * be freed. Individual bio requests are constructed for each discard and + * submitted to the block layer to be handled asynchronously. Any range + * with a length of zero or a length larger than UINT_MAX are ignored. + */ +static int +vdev_disk_io_discard(struct block_device *bdev, zio_t *zio) +{ + dio_request_t *dr; + dkioc_free_list_t *dfl = zio->io_dfl; + unsigned int max_discard_sectors; + unsigned int alignment, granularity; + struct request_queue *q; +#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) + struct blk_plug plug; +#endif + + q = bdev_get_queue(bdev); + if (!q) + return (SET_ERROR(ENXIO)); + + if (!blk_queue_discard(q)) + return (SET_ERROR(ENOTSUP)); + + zio->io_dfl_stats = kmem_zalloc(sizeof (vdev_stat_trim_t), KM_SLEEP); + dr = vdev_disk_dio_alloc(0); + dr->dr_zio = zio; + + granularity = MAX(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = MIN(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors -= max_discard_sectors % granularity; + + /* Extra reference to protect dio_request during vdev_submit_bio */ + vdev_disk_dio_get(dr); + vdev_disk_dio_blk_start_plug(dr, &plug); + + for (int i = 0; i < dfl->dfl_num_exts; i++) { + uint64_t nr_sectors = dfl->dfl_exts[i].dfle_length >> 9; + uint64_t sector = (dfl->dfl_exts[i].dfle_start + + dfl->dfl_offset) >> 9; + struct bio *bio; + unsigned int request_sectors; + sector_t end_sector; + + while (nr_sectors > 0) { + bio = bio_alloc(GFP_NOIO, 1); + if (unlikely(bio == NULL)) + break; + + request_sectors = min_t(sector_t, nr_sectors, + max_discard_sectors); + + /* When splitting requests align the end of each. */ + end_sector = sector + request_sectors; + if (request_sectors < nr_sectors && + (end_sector % granularity) != alignment) { + end_sector = ((end_sector - alignment) / + granularity) * granularity + alignment; + request_sectors = end_sector - sector; + } + + bio_set_dev(bio, bdev); + bio->bi_end_io = vdev_disk_io_discard_completion; + bio->bi_private = dr; + bio_set_discard(bio); + BIO_BI_SECTOR(bio) = sector; + BIO_BI_SIZE(bio) = request_sectors << 9; + + nr_sectors -= request_sectors; + sector = end_sector; + + vdev_trim_stat_update(zio, BIO_BI_SIZE(bio), + TRIM_STAT_OP | TRIM_STAT_RQ_HISTO); + + /* Matching put in vdev_disk_discard_completion */ + vdev_disk_dio_get(dr); + vdev_submit_bio(bio); + + cond_resched(); + } + } + + vdev_disk_dio_blk_finish_plug(dr, &plug); + (void) vdev_disk_dio_put(dr); + + return (0); +} + static void vdev_disk_io_start(zio_t *zio) { @@ -756,8 +911,6 @@ vdev_disk_io_start(zio_t *zio) break; case DKIOCFREE: - { - dkioc_free_list_t *dfl; if (!zfs_trim) break; @@ -775,35 +928,19 @@ vdev_disk_io_start(zio_t *zio) break; } - /* - * zio->io_dfl contains a dkioc_free_list_t - * specifying which offsets are to be freed - */ - dfl = zio->io_dfl; - ASSERT(dfl != NULL); - - for (int i = 0; i < dfl->dfl_num_exts; i++) { - int error; - - if (dfl->dfl_exts[i].dfle_length == 0) - continue; - - error = -blkdev_issue_discard(vd->vd_bdev, - (dfl->dfl_exts[i].dfle_start + - dfl->dfl_offset) >> 9, - dfl->dfl_exts[i].dfle_length >> 9, - GFP_NOFS, 0); - - if (error != 0) { - if (error == EOPNOTSUPP || - error == ENXIO) - v->vdev_notrim = B_TRUE; - zio->io_error = SET_ERROR(error); - break; - } + if (zfs_trim_sync) { + error = vdev_disk_io_discard_sync(vd->vd_bdev, + zio); + } else { + error = vdev_disk_io_discard(vd->vd_bdev, zio); + if (error == 0) + return; } + + zio->io_error = error; + break; - } + default: zio->io_error = SET_ERROR(ENOTSUP); } diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 8486ce2b259a..e1d52719a6c3 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -235,6 +235,10 @@ vdev_file_io_start(zio_t *zio) ASSERT(dfl != NULL); if (!zfs_trim) break; + + zio->io_dfl_stats = kmem_zalloc( + sizeof (vdev_stat_trim_t), KM_SLEEP); + for (int i = 0; i < dfl->dfl_num_exts; i++) { struct flock flck; int error; @@ -254,6 +258,9 @@ vdev_file_io_start(zio_t *zio) if (error != 0) { zio->io_error = SET_ERROR(error); break; + } else { + vdev_trim_stat_update(zio, flck.l_len, + TRIM_STAT_ALL); } } break; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b27bb64dd9b..9990bb3c6293 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -126,6 +126,7 @@ static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); */ int zfs_trim = B_TRUE; int zfs_trim_min_ext_sz = 128 << 10; /* 128k */ +int zfs_trim_sync = B_TRUE; void zio_init(void) @@ -869,6 +870,8 @@ zio_destroy(zio_t *zio) list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); + if (zio->io_dfl_stats != NULL) + kmem_free(zio->io_dfl_stats, sizeof (vdev_stat_trim_t)); if (zio->io_dfl != NULL && zio->io_dfl_free_on_destroy) dfl_free(zio->io_dfl); else @@ -4089,14 +4092,17 @@ zio_vdev_io_assess(zio_t *zio) } /* - * If a cache flush returns ENOTSUP or ENOTTY, we know that no future - * attempts will ever succeed. In this case we set a persistent bit so - * that we don't bother with it in the future. + * If a cache flush or discard returns ENOTSUP or ENOTTY, we know that + * no future attempts will ever succeed. In this case we set a + * persistent bit so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) - vd->vdev_nowritecache = B_TRUE; + zio->io_type == ZIO_TYPE_IOCTL && vd != NULL) { + if (zio->io_cmd == DKIOCFLUSHWRITECACHE) + vd->vdev_nowritecache = B_TRUE; + if (zio->io_cmd == DKIOCFREE) + vd->vdev_notrim = B_TRUE; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -5093,10 +5099,11 @@ MODULE_PARM_DESC(zio_dva_throttle_enabled, "Throttle block allocations in the ZIO pipeline"); module_param(zfs_trim, int, 0644); -MODULE_PARM_DESC(zfs_trim, - "Enable TRIM"); +MODULE_PARM_DESC(zfs_trim, "Enable TRIM"); module_param(zfs_trim_min_ext_sz, int, 0644); -MODULE_PARM_DESC(zfs_trim_min_ext_sz, - "Minimum size to TRIM"); +MODULE_PARM_DESC(zfs_trim_min_ext_sz, "Minimum size to TRIM"); + +module_param(zfs_trim_sync, int, 0644); +MODULE_PARM_DESC(zfs_trim_sync, "Issue TRIM commands synchronously"); #endif From f05d5e0d0d585eec37e93947c42a616d96d0c441 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 26 Apr 2017 20:38:55 -0400 Subject: [PATCH 07/38] Review feedback Requires-builders: none --- lib/libspl/include/sys/dkio.h | 3 ++- module/zfs/vdev_disk.c | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h index 5b537dd959cc..0a4fb5f51f74 100644 --- a/lib/libspl/include/sys/dkio.h +++ b/lib/libspl/include/sys/dkio.h @@ -552,7 +552,8 @@ typedef struct dkioc_free_list_s { dkioc_free_list_ext_t dfl_exts[1]; } dkioc_free_list_t; #define DFL_SZ(num_exts) \ - (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) + (sizeof (dkioc_free_list_t) +\ + (num_exts - 1) * sizeof (dkioc_free_list_ext_t)) #ifdef __cplusplus } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 10060b9c2e6f..824fae692b61 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -335,6 +335,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; + /* Set TRIM flag based on support reported by the underlying device. */ + v->vdev_notrim = !blk_queue_discard(bdev_get_queue(vd->vd_bdev)); + /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); @@ -441,15 +444,13 @@ vdev_disk_dio_put(dio_request_t *dr) static void vdev_disk_dio_blk_start_plug(dio_request_t *dr, struct blk_plug *plug) { - if (dr->dr_bio_count > 1) - blk_start_plug(plug); + blk_start_plug(plug); } static void vdev_disk_dio_blk_finish_plug(dio_request_t *dr, struct blk_plug *plug) { - if (dr->dr_bio_count > 1) - blk_finish_plug(plug); + blk_finish_plug(plug); } #else #define vdev_disk_dio_blk_start_plug(dr, plug) ((void)0) @@ -566,7 +567,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, uint64_t abd_offset; uint64_t bio_offset; int bio_size, bio_count = 16; - int i = 0, error = 0; + int i = 0, error = 0, should_plug = 0; #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) struct blk_plug plug; #endif @@ -607,6 +608,10 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (bio_size <= 0) break; + /* Plug the device when submitting multiple bio */ + if (!should_plug && i >= 1) + should_plug = 1; + /* * By default only 'bio_count' bio's per dio are allowed. * However, if we find ourselves in a situation where more @@ -647,14 +652,18 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); - vdev_disk_dio_blk_start_plug(dr, &plug); + + if (should_plug) + vdev_disk_dio_blk_start_plug(dr, &plug); /* Submit all bio's associated with this dio */ for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); - vdev_disk_dio_blk_finish_plug(dr, &plug); + if (should_plug) + vdev_disk_dio_blk_finish_plug(dr, &plug); + (void) vdev_disk_dio_put(dr); return (error); From 957d56f3edb3fdaf11c7768bab07fa3dfdd82d85 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 28 Apr 2017 15:53:30 -0400 Subject: [PATCH 08/38] Fix abd_alloc_sametype() panic Signed-off-by: Brian Behlendorf Requires-builders: none --- module/zfs/zio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 9990bb3c6293..c9ad87482432 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1269,9 +1269,10 @@ zio_trim_dfl(zio_t *pio, spa_t *spa, vdev_t *vd, dkioc_free_list_t *dfl, * queue's algorithms working more-or-less as they should. */ uint64_t off = dfl->dfl_exts[0].dfle_start; + uint64_t size = 1 << vd->vdev_top->vdev_ashift; - zio = zio_create(pio, spa, 0, NULL, NULL, 1 << vd->vdev_ashift, - 1 << vd->vdev_ashift, done, private, ZIO_TYPE_IOCTL, + zio = zio_create(pio, spa, 0, NULL, NULL, + size, size, done, private, ZIO_TYPE_IOCTL, auto_trim ? ZIO_PRIORITY_AUTO_TRIM : ZIO_PRIORITY_MAN_TRIM, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_AGGREGATE, vd, off, From d27976d2d313698dc58b3621d61e2973937152d1 Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Thu, 13 Apr 2017 15:30:55 +0200 Subject: [PATCH 09/38] Matt Ahrens' review comments. Porting Notes: Man page changes dropped for the moment. This can be reconsiled when the final version is merged to OpenZFS. They are accurate now, only worded a little differently. Requires-builders: none --- module/zfs/metaslab.c | 3 +++ module/zfs/range_tree.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index a8dd20c9c02f..f5e6bbb7f93b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -4524,6 +4524,8 @@ static void metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) { metaslab_t *msp = arg; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_cur_ts != NULL); range_tree_add(msp->ms_cur_ts->ts_tree, offset, size); if (msp->ms_prev_ts != NULL) { @@ -4830,6 +4832,7 @@ static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, uint64_t new_offset; ASSERT3U(*offset + size, <=, limit); + ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_trimming_ts == NULL) /* no trim conflict, original offset is OK */ diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 2e11e481249e..733b3e6c1c17 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -521,6 +521,8 @@ uint64_t range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs; + + ASSERT(MUTEX_HELD(rt->rt_lock)); while ((rs = range_tree_find_impl(rt, start, size)) != NULL) start = rs->rs_end; return (start); @@ -531,6 +533,7 @@ range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { range_seg_t *rs; + ASSERT(MUTEX_HELD(rt->rt_lock)); rs = range_tree_find(rt, off, size); if (rs != NULL) panic("freeing free block; rs=%p", (void *)rs); From 8fb4ccf16262a5e7d543f2aae1a0b73ee44f35f0 Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Thu, 27 Apr 2017 01:36:20 +0200 Subject: [PATCH 10/38] Matt Ahrens' review comments, round 3. 1) Removed the first-fit allocator. 2) Moved the autotrim metaslab scheduling logic into vdev_auto_trim. 2a) As a consequence of #2, metaslab_trimset_t was rendered superfluous. New trimsets are simple range_tree_t's. 3) Made ms_trimming_ts remove extents it is working on from ms_tree and then add them back in. 3a) As a consequence of #3, undone all the direct changes to the allocators and removed metaslab_check_trim_conflict and range_tree_find_gap. Porting Notes: * Removed WITH_*_ALLOCATOR macros and aligned remaining allocations with OpenZFS. Unused wariables warnings resolved with the gcc __attribute__ ((unused__ keyword. * Added missing calls for ms_condensing_cv. Signed-off-by: Brian Behlendorf Requires-builders: none --- include/sys/metaslab.h | 2 +- include/sys/metaslab_impl.h | 11 +- include/sys/range_tree.h | 1 - module/zfs/metaslab.c | 457 ++++++++++++++---------------------- module/zfs/range_tree.c | 17 -- module/zfs/vdev.c | 48 +++- module/zfs/zio.c | 2 +- 7 files changed, 225 insertions(+), 313 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 610720b8ebe8..2d2d74d84505 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -57,7 +57,7 @@ void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); -void metaslab_auto_trim(metaslab_t *, uint64_t, boolean_t); +void metaslab_auto_trim(metaslab_t *, boolean_t); uint64_t metaslab_trim_mem_used(metaslab_t *); /* diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 56967e9a5ab0..75e80ff8b7bd 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -279,11 +279,6 @@ struct metaslab_group { kcondvar_t mg_ms_initialize_cv; }; -typedef struct { - uint64_t ts_birth; /* TXG at which this trimset starts */ - range_tree_t *ts_tree; /* tree of extents in the trimset */ -} metaslab_trimset_t; - /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. @@ -358,10 +353,10 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; - metaslab_trimset_t *ms_cur_ts; /* currently prepared trims */ - metaslab_trimset_t *ms_prev_ts; /* previous (aging) trims */ + range_tree_t *ms_cur_ts; /* currently prepared trims */ + range_tree_t *ms_prev_ts; /* previous (aging) trims */ kcondvar_t ms_trim_cv; - metaslab_trimset_t *ms_trimming_ts; + range_tree_t *ms_trimming_ts; /* in flight trims */ /* * The following range trees are accessed only from syncing context. diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 4d1aaba5d53f..036720d3adb4 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -95,7 +95,6 @@ void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); boolean_t range_tree_contains_part(range_tree_t *rt, uint64_t start, uint64_t size); -uint64_t range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index f5e6bbb7f93b..ce387d931754 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -39,8 +39,6 @@ #include #include -#define WITH_DF_BLOCK_ALLOCATOR - #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) @@ -232,33 +230,6 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; #endif -/* - * How many TXG's worth of updates should be aggregated per TRIM/UNMAP - * issued to the underlying vdev. We keep two range trees of extents - * (called "trim sets") to be trimmed per metaslab, the `current' and - * the `previous' TS. New free's are added to the current TS. Then, - * once `zfs_txgs_per_trim' transactions have elapsed, the `current' - * TS becomes the `previous' TS and a new, blank TS is created to be - * the new `current', which will then start accumulating any new frees. - * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's - * extents are trimmed, the TS is destroyed and the current TS again - * becomes the previous TS. - * This serves to fulfill two functions: aggregate many small frees - * into fewer larger trim operations (which should help with devices - * which do not take so kindly to them) and to allow for disaster - * recovery (extents won't get trimmed immediately, but instead only - * after passing this rather long timeout, thus preserving - * 'zfs import -F' functionality). - * The exact default value of this tunable is a tradeoff between: - * 1) Keeping the trim commands reasonably small. - * 2) Keeping the ability to rollback back for as many txgs as possible. - * 3) Waiting around too long that the user starts to get uneasy about not - * seeing any space being freed after they remove some files. - * The default value of 32 is the maximum number of uberblocks in a vdev - * label, assuming a 4k physical sector size (which seems to be the almost - * universal smallest sector size used in SSDs). - */ -unsigned int zfs_txgs_per_trim = 32; /* * Maximum number of bytes we'll put into a single zio_trim. This is for * vdev queue processing purposes and also because some devices advertise @@ -269,13 +240,11 @@ uint64_t zfs_max_bytes_per_trim = 128 << 20; static void metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size); static void metaslab_trim_add(void *arg, uint64_t offset, uint64_t size); +static uint64_t metaslab_trimming_space(const metaslab_t *msp); static zio_t *metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim); -static metaslab_trimset_t *metaslab_new_trimset(uint64_t txg, kmutex_t *lock); -static void metaslab_free_trimset(metaslab_trimset_t *ts); -static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, - uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit); +static void metaslab_free_trimset(range_tree_t *ts); /* * ========================================================================== @@ -582,7 +551,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) } msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + - msp->ms_deferspace + range_tree_space(msp->ms_freed); + msp->ms_deferspace + range_tree_space(msp->ms_freed) + + metaslab_trimming_space(msp); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -1209,29 +1179,22 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) return (rs); } -#if defined(WITH_FF_BLOCK_ALLOCATOR) || \ - defined(WITH_DF_BLOCK_ALLOCATOR) || \ - defined(WITH_CF_BLOCK_ALLOCATOR) /* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, - uint64_t size, uint64_t align) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); - for (; rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); - - if (offset + size <= rs->rs_end && - !metaslab_check_trim_conflict(msp, &offset, size, align, - rs->rs_end)) { - *cursor = offset + size; - return (offset); + while (rs != NULL) { + if (rs->rs_start + size <= rs->rs_end) { + *cursor = rs->rs_start + size; + return (rs->rs_start); } + rs = AVL_NEXT(t, rs); } /* @@ -1242,41 +1205,9 @@ metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(msp, t, cursor, size, align)); + return (metaslab_block_picker(t, cursor, size)); } -#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ -#if defined(WITH_FF_BLOCK_ALLOCATOR) -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(msp, t, cursor, size, align)); -} - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; -#endif /* WITH_FF_BLOCK_ALLOCATOR */ - -#if defined(WITH_DF_BLOCK_ALLOCATOR) /* * ========================================================================== * Dynamic block allocator - @@ -1319,17 +1250,13 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); + return (metaslab_block_picker(t, cursor, size)); } static metaslab_ops_t metaslab_df_ops = { metaslab_df_alloc }; -metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; -#endif /* WITH_DF_BLOCK_ALLOCATOR */ - -#if defined(WITH_CF_BLOCK_ALLOCATOR) /* * ========================================================================== * Cursor fit block allocator - @@ -1342,8 +1269,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + ASSERTV(range_tree_t *rt = msp->ms_allocatable); + ASSERTV(avl_tree_t *t = &msp->ms_allocatable_by_size); uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; @@ -1356,19 +1283,12 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - for (rs = avl_last(&msp->ms_allocatable_by_size); - rs != NULL && rs->rs_end - rs->rs_start >= size; - rs = AVL_PREV(&msp->allocatable_by_size)) { - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; - if (!metaslab_check_trim_conflict(msp, cursor, size, - 1, *cursor_end)) { - /* segment appears to be acceptable */ - break; - } - } - if (rs == NULL || rs->rs_end - rs->rs_start < size) + rs = avl_last(&msp->ms_size_tree); + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); + + *cursor = rs->rs_start; + *cursor_end = rs->rs_end; } offset = *cursor; @@ -1377,14 +1297,10 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static metaslab_ops_t metaslab_cf_ops = { +static metaslab_ops_t metaslab_cf_ops __attribute__((unused)) = { metaslab_cf_alloc }; -metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; -#endif /* WITH_CF_BLOCK_ALLOCATOR */ - -#if defined(WITH_NDF_BLOCK_ALLOCATOR) /* * ========================================================================== * New dynamic fit allocator - @@ -1409,8 +1325,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_block_maxsize(msp); - /* mutable copy for adjustment by metaslab_check_trim_conflict */ - uint64_t adjustable_start; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1423,12 +1337,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); - if (rs != NULL) - adjustable_start = rs->rs_start; - if (rs == NULL || rs->rs_end - adjustable_start < size || - metaslab_check_trim_conflict(msp, &adjustable_start, size, 1, - rs->rs_end)) { - /* segment not usable, try the largest remaining one */ + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; @@ -1438,26 +1347,20 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); - adjustable_start = rs->rs_start; - if (rs->rs_end - adjustable_start < size || - metaslab_check_trim_conflict(msp, &adjustable_start, - size, 1, rs->rs_end)) { - /* even largest remaining segment not usable */ - return (-1ULL); - } } - *cursor = adjustable_start + size; - return (*cursor); + if ((rs->rs_end - rs->rs_start) >= size) { + *cursor = rs->rs_start + size; + return (rs->rs_start); + } + return (-1ULL); } -static metaslab_ops_t metaslab_ndf_ops = { +static metaslab_ops_t metaslab_ndf_ops __attribute__((unused)) = { metaslab_ndf_alloc }; -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; -#endif /* WITH_NDF_BLOCK_ALLOCATOR */ - +metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== @@ -1526,6 +1429,16 @@ metaslab_load_impl(metaslab_t *msp) range_tree_walk(msp->ms_defer[t], metaslab_trim_remove, msp); } + + /* + * If there's a trim ongoing, punch out the holes that will + * be filled back in in metaslab_trim_done. + */ + if (msp->ms_trimming_ts != NULL) { + range_tree_walk(msp->ms_trimming_ts, range_tree_remove, + msp->ms_tree); + } + msp->ms_max_size = metaslab_block_maxsize(msp); } msp->ms_max_size = metaslab_block_maxsize(msp); @@ -1592,7 +1505,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_trim_cv, NULL, CV_DEFAULT, NULL); - + cv_init(&ms->ms_condensing_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; @@ -1615,7 +1528,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ASSERT(ms->ms_sm != NULL); } - ms->ms_cur_ts = metaslab_new_trimset(0, &ms->ms_lock); + ms->ms_cur_ts = range_tree_create(NULL, NULL, &ms->ms_lock); /* * We create the main range tree here, but we don't create the @@ -1710,6 +1623,7 @@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_trim_cv); + cv_destroy(&msp->ms_condensing_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2362,6 +2276,11 @@ metaslab_should_condense(metaslab_t *msp) uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); + if (msp->ms_trimming_ts != NULL) { + optimal_size += sizeof (uint64_t) * + avl_numnodes(&msp->ms_trimming_ts->rt_root); + } + dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); @@ -2390,7 +2309,9 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), + avl_numnodes(&msp->ms_allocatable->rt_root) + + (msp->ms_trimming_ts != NULL ? + avl_numnodes(&msp->ms_trimming_ts->rt_root) : 0), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; @@ -2446,9 +2367,24 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + if (msp->ms_trimming_ts == NULL) { + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + } else { + /* + * While trimming, the stuff being trimmed isn't in ms_tree, + * but we still want our persistent state to reflect that. So + * we construct a temporary union of the two trees. + */ + range_tree_t *rt = range_tree_create(NULL, NULL, &msp->ms_lock); + range_tree_walk(msp->ms_tree, range_tree_add, rt); + range_tree_walk(msp->ms_trimming_ts, range_tree_add, rt); + space_map_write(sm, rt, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + } mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; + cv_broadcast(&msp->ms_condensing_cv); } /* @@ -2569,6 +2505,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + if (msp->ms_trimming_ts != NULL) { + /* Stuff currently being trimmed is also free. */ + space_map_histogram_add(msp->ms_sm, + msp->ms_trimming_ts, tx); + } + /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the @@ -3370,8 +3312,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize) || - msp->ms_trimming_ts != NULL); + ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -3920,8 +3861,16 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_allocatable) + size + + metaslab_trimming_space(msp), <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY(!range_tree_contains(msp->ms_allocating[txg & TXG_MASK], + offset, size)); + if (msp->ms_trimming_ts != NULL) { + VERIFY(!range_tree_contains(msp->ms_trimming_ts, + offset, size)); + } range_tree_add(msp->ms_allocatable, offset, size); if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON && !vd->vdev_man_trimming) @@ -4356,14 +4305,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) if (msp->ms_loaded) { VERIFY(&msp->ms_lock == msp->ms_tree->rt_lock); range_tree_verify(msp->ms_allocatable, offset, size); + if (msp->ms_trimming_ts) { + range_tree_verify(msp->ms_trimming_ts, + offset, size); + } #ifdef DEBUG - VERIFY(&msp->ms_lock == msp->ms_cur_ts->ts_tree->rt_lock); - range_tree_verify(msp->ms_cur_ts->ts_tree, offset, size); + VERIFY3P(&msp->ms_lock, ==, msp->ms_cur_ts->rt_lock); + range_tree_verify(msp->ms_cur_ts, offset, size); if (msp->ms_prev_ts != NULL) { - VERIFY(&msp->ms_lock == - msp->ms_prev_ts->ts_tree->rt_lock); - range_tree_verify(msp->ms_prev_ts->ts_tree, - offset, size); + VERIFY3P(&msp->ms_lock, ==, msp->ms_prev_ts->rt_lock); + range_tree_verify(msp->ms_prev_ts, offset, size); } #endif } @@ -4445,7 +4396,7 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, * from the last cursor position, but not more than the trim run * limit. */ - range_tree_vacate(msp->ms_cur_ts->ts_tree, NULL, NULL); + range_tree_vacate(msp->ms_cur_ts, NULL, NULL); rsearch.rs_start = cur; rsearch.rs_end = cur + SPA_MINBLOCKSIZE; @@ -4477,7 +4428,7 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, if (trimmed_space != 0) { /* Force this trim to take place ASAP. */ msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = metaslab_new_trimset(0, &msp->ms_lock); + msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); trim_io = metaslab_exec_trim(msp, B_FALSE); ASSERT(trim_io != NULL); @@ -4508,11 +4459,11 @@ metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size) { metaslab_t *msp = arg; - range_tree_clear(msp->ms_cur_ts->ts_tree, offset, size); + range_tree_clear(msp->ms_cur_ts, offset, size); if (msp->ms_prev_ts != NULL) - range_tree_clear(msp->ms_prev_ts->ts_tree, offset, size); + range_tree_clear(msp->ms_prev_ts, offset, size); ASSERT(msp->ms_trimming_ts == NULL || - !range_tree_contains(msp->ms_trimming_ts->ts_tree, offset, size)); + !range_tree_contains(msp->ms_trimming_ts, offset, size)); } /* @@ -4527,16 +4478,25 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_cur_ts != NULL); - range_tree_add(msp->ms_cur_ts->ts_tree, offset, size); - if (msp->ms_prev_ts != NULL) { - ASSERT(!range_tree_contains_part(msp->ms_prev_ts->ts_tree, - offset, size)); - } + range_tree_add(msp->ms_cur_ts, offset, size); + ASSERT(msp->ms_prev_ts == NULL || + !range_tree_contains_part(msp->ms_prev_ts, offset, size)); } /* - * Does a metaslab's automatic trim operation processing. This function - * issues trims in intervals as dictated by the zfs_txgs_per_trim tunable. + * Returns the amount of space currently being trimmed. + */ +static uint64_t +metaslab_trimming_space(const metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_trimming_ts == NULL) + return (0); + return (range_tree_space(msp->ms_trimming_ts)); +} + +/* + * Does a metaslab's automatic trim operation processing. * If the previous trimset has not yet finished trimming, this function * decides what to do based on `preserve_spilled'. If preserve_spilled is * false, the next trimset which would have been issued is simply dropped to @@ -4544,78 +4504,55 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) * trimset. */ void -metaslab_auto_trim(metaslab_t *msp, uint64_t txg, boolean_t preserve_spilled) +metaslab_auto_trim(metaslab_t *msp, boolean_t preserve_spilled) { - /* for atomicity */ - uint64_t txgs_per_trim = zfs_txgs_per_trim; - ASSERT(!MUTEX_HELD(&msp->ms_lock)); mutex_enter(&msp->ms_lock); /* - * Since we typically have hundreds of metaslabs per vdev, but we only - * trim them once every zfs_txgs_per_trim txgs, it'd be best if we - * could sequence the TRIM commands from all metaslabs so that they - * don't all always pound the device in the same txg. We do so by - * artificially inflating the birth txg of the first trim set by a - * sequence number derived from the metaslab's starting offset - * (modulo zfs_txgs_per_trim). Thus, for the default 200 metaslabs and - * 32 txgs per trim, we'll only be trimming ~6.25 metaslabs per txg. - * - * If we detect that the txg has advanced too far ahead of ts_birth, - * it means our birth txg is out of lockstep. Recompute it by - * rounding down to the nearest zfs_txgs_per_trim multiple and adding - * our metaslab id modulo zfs_txgs_per_trim. + * Always swap out the current and previous trimsets. Normally this + * should be done at intervals of zfs_txgs_per_trim. The code which + * controls this is in vdev_auto_trim. */ - if (txg > msp->ms_cur_ts->ts_birth + txgs_per_trim) { - msp->ms_cur_ts->ts_birth = (txg / txgs_per_trim) * - txgs_per_trim + (msp->ms_id % txgs_per_trim); - } - - /* Time to swap out the current and previous trimsets */ - if (txg == msp->ms_cur_ts->ts_birth + txgs_per_trim) { - if (msp->ms_prev_ts != NULL) { - if (msp->ms_trimming_ts != NULL) { - spa_t *spa = msp->ms_group->mg_class->mc_spa; - /* - * The previous trim run is still ongoing, so - * the device is reacting slowly to our trim - * requests. Drop this trimset, so as not to - * back the device up with trim requests. - */ - if (preserve_spilled) { - DTRACE_PROBE1(preserve__spilled, - metaslab_t *, msp); - range_tree_vacate( - msp->ms_prev_ts->ts_tree, - range_tree_add, - msp->ms_cur_ts->ts_tree); - } else { - DTRACE_PROBE1(drop__spilled, - metaslab_t *, msp); - spa_trimstats_auto_slow_incr(spa); - } - metaslab_free_trimset(msp->ms_prev_ts); - } else if (msp->ms_group->mg_vd->vdev_man_trimming) { - /* - * If a manual trim is ongoing, we want to - * inhibit autotrim temporarily so it doesn't - * slow down the manual trim. - */ - metaslab_free_trimset(msp->ms_prev_ts); + if (msp->ms_prev_ts != NULL) { + if (msp->ms_trimming_ts != NULL) { + spa_t *spa = msp->ms_group->mg_class->mc_spa; + /* + * The previous trim run is still ongoing, so the + * device is reacting slowly to trims. Consider + * dropping this trimset, so as not to back the + * device up. + */ + if (preserve_spilled) { + DTRACE_PROBE1(preserve__spilled, + metaslab_t *, msp); + range_tree_vacate(msp->ms_prev_ts, + range_tree_add, msp->ms_cur_ts); } else { - /* - * Trim out aged extents on the vdevs - these - * are safe to be destroyed now. We'll keep - * the trimset around to deny allocations from - * these regions while the trims are ongoing. - */ - zio_nowait(metaslab_exec_trim(msp, B_TRUE)); + DTRACE_PROBE1(drop__spilled, metaslab_t *, msp); + spa_trimstats_auto_slow_incr(spa); } + metaslab_free_trimset(msp->ms_prev_ts); + } else if (msp->ms_group->mg_vd->vdev_man_trimming) { + /* + * If a manual trim is ongoing, we want to inhibit + * autotrim temporarily so it doesn't slow down the + * manual trim. + */ + metaslab_free_trimset(msp->ms_prev_ts); + } else { + /* + * Trim out aged extents on the vdevs - these are safe + * to be destroyed now. We'll keep the trimset around + * to deny allocations from these regions while the + * trims are ongoing. + */ + zio_nowait(metaslab_exec_trim(msp, B_TRUE)); } - msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = metaslab_new_trimset(txg, &msp->ms_lock); } + msp->ms_prev_ts = msp->ms_cur_ts; + msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); + mutex_exit(&msp->ms_lock); } @@ -4629,15 +4566,15 @@ metaslab_auto_trim(metaslab_t *msp, uint64_t txg, boolean_t preserve_spilled) * get it "close enough". */ static uint64_t -metaslab_trimset_mem_used(metaslab_trimset_t *ts) +metaslab_trimset_mem_used(range_tree_t *ts) { uint64_t result = 0; - result += avl_numnodes(&ts->ts_tree->rt_root) * (sizeof (range_seg_t) + + result += avl_numnodes(&ts->rt_root) * (sizeof (range_seg_t) + sizeof (dkioc_free_list_ext_t)); - result += ((range_tree_space(ts->ts_tree) / zfs_max_bytes_per_trim) + - 1) * sizeof (zio_t); - result += sizeof (range_tree_t) + sizeof (metaslab_trimset_t); + result += ((range_tree_space(ts) / zfs_max_bytes_per_trim) + 1) * + sizeof (zio_t); + result += sizeof (range_tree_t); return (result); } @@ -4672,6 +4609,10 @@ metaslab_trim_done(zio_t *zio) held = MUTEX_HELD(&msp->ms_lock); if (!held) mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) { + range_tree_walk(msp->ms_trimming_ts, range_tree_add, + msp->ms_tree); + } metaslab_free_trimset(msp->ms_trimming_ts); msp->ms_trimming_ts = NULL; cv_broadcast(&msp->ms_trim_cv); @@ -4717,24 +4658,33 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) cv_wait(&msp->ms_trim_cv, &msp->ms_lock); msp->ms_trimming_ts = msp->ms_prev_ts; msp->ms_prev_ts = NULL; - trim_tree = msp->ms_trimming_ts->ts_tree; -#ifdef DEBUG + trim_tree = msp->ms_trimming_ts; + if (msp->ms_loaded) { for (range_seg_t *rs = avl_first(&trim_tree->rt_root); rs != NULL; rs = AVL_NEXT(&trim_tree->rt_root, rs)) { +#ifdef DEBUG if (!range_tree_contains_part(msp->ms_tree, rs->rs_start, rs->rs_end - rs->rs_start)) { panic("trimming allocated region; rs=%p", (void*)rs); } +#endif /* DEBUG */ + /* + * To avoid allocating from the range of extents we're + * currently destroying, temporarily remove them from + * the tree of free space. They'll then be added back + * in in metaslab_trim_done. + */ + range_tree_remove(msp->ms_tree, rs->rs_start, + rs->rs_end - rs->rs_start); } } -#endif /* Nothing to trim */ if (range_tree_space(trim_tree) == 0) { metaslab_free_trimset(msp->ms_trimming_ts); - msp->ms_trimming_ts = 0; + msp->ms_trimming_ts = NULL; return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); } @@ -4786,67 +4736,13 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) } /* - * Allocates and initializes a new trimset structure. The `txg' argument - * indicates when this trimset was born and `lock' indicates the lock to - * link to the range tree. - */ -static metaslab_trimset_t * -metaslab_new_trimset(uint64_t txg, kmutex_t *lock) -{ - metaslab_trimset_t *ts; - - ts = kmem_zalloc(sizeof (*ts), KM_SLEEP); - ts->ts_birth = txg; - ts->ts_tree = range_tree_create(NULL, NULL, lock); - - return (ts); -} - -/* - * Destroys and frees a trim set previously allocated by metaslab_new_trimset. + * Destroys and frees a trim set. */ static void -metaslab_free_trimset(metaslab_trimset_t *ts) +metaslab_free_trimset(range_tree_t *ts) { - range_tree_vacate(ts->ts_tree, NULL, NULL); - range_tree_destroy(ts->ts_tree); - kmem_free(ts, sizeof (*ts)); -} - -/* - * Checks whether an allocation conflicts with an ongoing trim operation in - * the given metaslab. This function takes a segment starting at `*offset' - * of `size' and checks whether it hits any region in the metaslab currently - * being trimmed. If yes, it tries to adjust the allocation to the end of - * the region being trimmed (P2ROUNDUP aligned by `align'), but only up to - * `limit' (no part of the allocation is allowed to go past this point). - * - * Returns B_FALSE if either the original allocation wasn't in conflict, or - * the conflict could be resolved by adjusting the value stored in `offset' - * such that the whole allocation still fits below `limit'. Returns B_TRUE - * if the allocation conflict couldn't be resolved. - */ -static boolean_t metaslab_check_trim_conflict(metaslab_t *msp, - uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit) -{ - uint64_t new_offset; - - ASSERT3U(*offset + size, <=, limit); - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if (msp->ms_trimming_ts == NULL) - /* no trim conflict, original offset is OK */ - return (B_FALSE); - - new_offset = P2ROUNDUP(range_tree_find_gap(msp->ms_trimming_ts->ts_tree, - *offset, size), align); - if (new_offset + size > limit) - /* trim conflict and adjustment not possible */ - return (B_TRUE); - - /* trim conflict, but adjusted offset still within limit */ - *offset = new_offset; - return (B_FALSE); + range_tree_vacate(ts, NULL, NULL); + range_tree_destroy(ts); } #if defined(_KERNEL) @@ -4899,13 +4795,8 @@ module_param(zfs_metaslab_switch_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_switch_threshold, "segment-based metaslab selection maximum buckets before switching"); -module_param(zfs_txgs_per_trim, int, 0644); -MODULE_PARM_DESC(zfs_txgs_per_trim, - "txgs per trim"); - module_param(metaslab_force_ganging, ulong, 0644); MODULE_PARM_DESC(metaslab_force_ganging, "blocks larger than this size are forced to be gang blocks"); /* END CSTYLED */ - #endif diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 733b3e6c1c17..29827b750f92 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -511,23 +511,6 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) return (NULL); } -/* - * Given an extent start offset and size, will look through the provided - * range tree and find a suitable start offset (starting at `start') such - * that the requested extent _doesn't_ overlap with any range segment in - * the range tree. - */ -uint64_t -range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(rt->rt_lock)); - while ((rs = range_tree_find_impl(rt, start, size)) != NULL) - start = rs->rs_end; - return (start); -} - void range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 22780cbfdb2a..8389343d9478 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -209,6 +209,34 @@ static vdev_ops_t *vdev_ops_table[] = { */ int zfs_trim_mem_lim_fact = 50; +/* + * How many TXG's worth of updates should be aggregated per TRIM/UNMAP + * issued to the underlying vdev. We keep two range trees of extents + * (called "trim sets") to be trimmed per metaslab, the `current' and + * the `previous' TS. New free's are added to the current TS. Then, + * once `zfs_txgs_per_trim' transactions have elapsed, the `current' + * TS becomes the `previous' TS and a new, blank TS is created to be + * the new `current', which will then start accumulating any new frees. + * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's + * extents are trimmed, the TS is destroyed and the current TS again + * becomes the previous TS. + * This serves to fulfill two functions: aggregate many small frees + * into fewer larger trim operations (which should help with devices + * which do not take so kindly to them) and to allow for disaster + * recovery (extents won't get trimmed immediately, but instead only + * after passing this rather long timeout, thus preserving + * 'zfs import -F' functionality). + * The exact default value of this tunable is a tradeoff between: + * 1) Keeping the trim commands reasonably small. + * 2) Keeping the ability to rollback back for as many txgs as possible. + * 3) Waiting around too long that the user starts to get uneasy about not + * seeing any space being freed after they remove some files. + * The default value of 32 is the maximum number of uberblocks in a vdev + * label, assuming a 4k physical sector size (which seems to be the almost + * universal smallest sector size used in SSDs). + */ +unsigned int zfs_txgs_per_trim = 32; + /* * Given a vdev type, return the appropriate ops vector. */ @@ -4852,6 +4880,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) vdev_t *vd = vti->vti_vdev; spa_t *spa = vd->vdev_spa; uint64_t txg = vti->vti_txg; + uint64_t txgs_per_trim = zfs_txgs_per_trim; uint64_t mlim = 0, mused = 0; boolean_t limited; @@ -4868,8 +4897,20 @@ vdev_auto_trim(vdev_trim_info_t *vti) limited = mused > mlim; DTRACE_PROBE3(autotrim__mem__lim, vdev_t *, vd, uint64_t, mused, uint64_t, mlim); - for (uint64_t i = 0; i < vd->vdev_ms_count; i++) - metaslab_auto_trim(vd->vdev_ms[i], txg, !limited); + + /* + * Since we typically have hundreds of metaslabs per vdev, but we only + * trim them once every zfs_txgs_per_trim txgs, it'd be best if we + * could sequence the TRIM commands from all metaslabs so that they + * don't all always pound the device in the same txg. We do so taking + * the txg number modulo txgs_per_trim and then skipping by + * txgs_per_trim. Thus, for the default 200 metaslabs and 32 + * txgs_per_trim, we'll only be trimming ~6.25 metaslabs per txg. + */ + for (uint64_t i = txg % txgs_per_trim; i < vd->vdev_ms_count; + i += txgs_per_trim) + metaslab_auto_trim(vd->vdev_ms[i], !limited); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); out: @@ -4971,5 +5012,8 @@ MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); module_param(zfs_trim_mem_lim_fact, int, 0644); MODULE_PARM_DESC(metaslabs_per_vdev, "Maximum percentage of physical memory " "to be used for storing trim extents"); + +module_param(zfs_txgs_per_trim, int, 0644); +MODULE_PARM_DESC(zfs_txgs_per_trim, "Number of txgs per trim"); /* END CSTYLED */ #endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c9ad87482432..0b283f62b86b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1327,7 +1327,7 @@ zio_trim_check(uint64_t start, uint64_t len, void *msp) mutex_enter(&ms->ms_lock); ASSERT(ms->ms_trimming_ts != NULL); if (ms->ms_loaded) - ASSERT(range_tree_contains(ms->ms_trimming_ts->ts_tree, + ASSERT(range_tree_contains(ms->ms_trimming_ts, start - VDEV_LABEL_START_SIZE, len)); if (!held) mutex_exit(&ms->ms_lock); From 76439d553676e5597d1c4461a482e6e97f39dcd8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 22 May 2017 13:12:30 -0400 Subject: [PATCH 11/38] Tim Chase's review comments, round 2. Porting Notes: * metaslab_sync changes already applied. * resync of test cases needed Requires-builders: none --- module/zfs/vdev.c | 9 +++++---- .../zfs-tests/tests/functional/trim/autotrim_001_pos.ksh | 2 +- tests/zfs-tests/tests/functional/trim/trim.kshlib | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8389343d9478..3fa5a6fd73e5 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4829,6 +4829,7 @@ vdev_man_trim(vdev_trim_info_t *vti) for (;;) { uint64_t rate = spa->spa_man_trim_rate; uint64_t sleep_delay; + clock_t t1; if (rate == 0) { /* No delay, just update 't' and move on. */ @@ -4838,16 +4839,16 @@ vdev_man_trim(vdev_trim_info_t *vti) sleep_delay = (delta * hz) / rate; mutex_enter(&spa->spa_man_trim_lock); - (void) cv_timedwait(&spa->spa_man_trim_update_cv, - &spa->spa_man_trim_lock, t); + t1 = cv_timedwait(&spa->spa_man_trim_update_cv, + &spa->spa_man_trim_lock, t + sleep_delay); mutex_exit(&spa->spa_man_trim_lock); /* If interrupted, don't try to relock, get out */ if (spa->spa_man_trim_stop) goto out; - /* Timeout passed, move on to the next metaslab. */ - if (ddi_get_lbolt() >= t + sleep_delay) { + /* Timeout passed, move on to the next chunk. */ + if (t1 == -1) { t += sleep_delay; break; } diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh index fc74bb7bf570..239ce86eb9cf 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh @@ -39,7 +39,7 @@ function getsizemb { typeset rval - rval=$(du --block-size 1048576 -s "$1" | sed -e 's;[ ].*;;') + rval=$(du --block-size 1048576 -s "$1" | awk '{print $1}') echo -n "$rval" } diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib index 041c1f0754b7..d1b35f0aa46d 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.kshlib +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -25,7 +25,8 @@ function set_tunable function find_scsi_debug { - grep -H scsi_debug /sys/block/*/device/model | $AWK -F/ '{print $4}' | tr '\n' ' ' + grep -H scsi_debug /sys/block/*/device/model | \ + awk -F/ '{print $4}' | tr '\n' ' ' } function setupvdevs From f4767477a89bcefcd5fd6d1767806cb6c1aaeec1 Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Fri, 12 May 2017 12:23:43 +0200 Subject: [PATCH 12/38] Matt Ahren's review comments round 4: 1) Simplified the SM_FREE spacemap writing while a trim is active. 2) Simplified the range_tree_verify in metaslab_check_free. 3) Clarified comment above metaslab_trim_all. 4) Substituted 'flust out' with 'drop' in comment in metaslab_trim_all. 5) Moved ms_prev_ts clearing up to ms_cur_ts claring in metaslab_trim_all. 6) Added recomputation of metaslab weight when metaslab is loaded. 7) Moved dmu_tx_commit inside of spa_trim_update_time. 8) Made the smallest allowable manual trim rate 1/1000th of a metaslab size. 9) Switched to using hrtime_t in manual trim timing logic. 10) Changed "limited" to "preserve_spilled" in vdev_auto_trim. 11) Moved vdev_notrim setting into zio_vdev_io_assess.a Porting Notes: * vdev_disk.c and zio.c hunks already applied. * nsec_per_tick -> MSEC2NSEC(1) Requires-builders: none --- module/zfs/metaslab.c | 101 +++++++++++++++++++++++------------------- module/zfs/spa.c | 42 ++++++++---------- module/zfs/vdev.c | 28 +++++++----- 3 files changed, 92 insertions(+), 79 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ce387d931754..f4f01a427486 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2367,21 +2367,10 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - if (msp->ms_trimming_ts == NULL) { - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); - } else { - /* - * While trimming, the stuff being trimmed isn't in ms_tree, - * but we still want our persistent state to reflect that. So - * we construct a temporary union of the two trees. - */ - range_tree_t *rt = range_tree_create(NULL, NULL, &msp->ms_lock); - range_tree_walk(msp->ms_tree, range_tree_add, rt); - range_tree_walk(msp->ms_trimming_ts, range_tree_add, rt); - space_map_write(sm, rt, SM_FREE, SM_NO_VDEVID, tx); - range_tree_vacate(rt, NULL, NULL); - range_tree_destroy(rt); - } + space_map_write(sm, msp->ms_tree, SM_FREE, tx); + if (msp->ms_trimming_ts != NULL) + space_map_write(sm, msp->ms_trimming_ts, SM_FREE, SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; cv_broadcast(&msp->ms_condensing_cv); @@ -4302,23 +4291,14 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) { - VERIFY(&msp->ms_lock == msp->ms_tree->rt_lock); + if (msp->ms_loaded) range_tree_verify(msp->ms_allocatable, offset, size); - if (msp->ms_trimming_ts) { - range_tree_verify(msp->ms_trimming_ts, - offset, size); - } -#ifdef DEBUG - VERIFY3P(&msp->ms_lock, ==, msp->ms_cur_ts->rt_lock); - range_tree_verify(msp->ms_cur_ts, offset, size); - if (msp->ms_prev_ts != NULL) { - VERIFY3P(&msp->ms_lock, ==, msp->ms_prev_ts->rt_lock); - range_tree_verify(msp->ms_prev_ts, offset, size); - } -#endif - } - + if (msp->ms_trimming_ts) + range_tree_verify(msp->ms_trimming_ts, offset, size); + ASSERT(msp->ms_cur_ts != NULL); + range_tree_verify(msp->ms_cur_ts, offset, size); + if (msp->ms_prev_ts != NULL) + range_tree_verify(msp->ms_prev_ts, offset, size); range_tree_verify(msp->ms_freeing, offset, size); range_tree_verify(msp->ms_checkpointing, offset, size); range_tree_verify(msp->ms_freed, offset, size); @@ -4351,17 +4331,32 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) } /* - * Trims all free space in the metaslab. Returns the root TRIM zio (that the - * caller should zio_wait() for) and the amount of space in the metaslab that - * has been scheduled for trimming in the `delta' return argument. + * This is used to trim all free space in a metaslab. The caller must + * initially set 'cursor' to the start offset of the metaslab. This function + * then walks the free space starting at or after this cursor and composes a + * TRIM zio for it. The function limits the number of bytes placed into the + * TRIM zio to at most zfs_max_bytes_per_trim. If the limit was hit before + * trimming all free space in the metaslab, the 'cursor' is updated to the + * last place we left off. The caller should keep calling this function in + * a loop as long as there is more space to trim. The function returns a TRIM + * zio that the caller should zio_wait for. If there is no more free space to + * trim in this metaslab, the function returns NULL instead. The 'delta' + * return argument contains the number of bytes scheduled for trimming in the + * returned TRIM zio. + * During execution, this function needs to load the metaslab. 'was_loaded' + * is an external state variable that is used to determine if the metaslab + * load was initiated by us and therefore whether we should unload the + * metaslab once we're done. */ zio_t * metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, boolean_t *was_loaded) { - uint64_t cur = *cursor, trimmed_space = 0; + uint64_t cur = *cursor; + uint64_t trimmed_space = 0; zio_t *trim_io = NULL; - range_seg_t rsearch, *rs; + range_seg_t rsearch; + range_seg_t *rs; avl_index_t where; const uint64_t max_bytes = zfs_max_bytes_per_trim; @@ -4392,12 +4387,17 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, } /* - * Flush out any scheduled extents and add everything in ms_tree - * from the last cursor position, but not more than the trim run - * limit. + * Drop any scheduled extents and add everything in ms_tree from + * the last cursor position, but not more than the trim run limit. */ range_tree_vacate(msp->ms_cur_ts, NULL, NULL); + /* Clear out ms_prev_ts, since we'll be trimming everything. */ + if (msp->ms_prev_ts != NULL) { + metaslab_free_trimset(msp->ms_prev_ts); + msp->ms_prev_ts = NULL; + } + rsearch.rs_start = cur; rsearch.rs_end = cur + SPA_MINBLOCKSIZE; rs = avl_find(&msp->ms_tree->rt_root, &rsearch, &where); @@ -4407,12 +4407,6 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, cur = rs->rs_start; } - /* Clear out ms_prev_ts, since we'll be trimming everything. */ - if (msp->ms_prev_ts != NULL) { - metaslab_free_trimset(msp->ms_prev_ts); - msp->ms_prev_ts = NULL; - } - while (rs != NULL && trimmed_space < max_bytes) { uint64_t end; if (cur < rs->rs_start) @@ -4481,6 +4475,11 @@ metaslab_trim_add(void *arg, uint64_t offset, uint64_t size) range_tree_add(msp->ms_cur_ts, offset, size); ASSERT(msp->ms_prev_ts == NULL || !range_tree_contains_part(msp->ms_prev_ts, offset, size)); + /* + * This might have been called from the manual trim code path + * while an autotrim is demolishing this extent, so we can't + * ASSERT against ms_trimming_ts here. + */ } /* @@ -4688,6 +4687,18 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); } + if (msp->ms_loaded) { + /* + * Recompute of the metaslab's weight & resort it. This is only + * done when we're loaded, because then the trim_tree will have + * affected ms_tree and its histogram. We cannot adjust the + * histogram for the on-disk spacemap, however, because we + * don't know which buckets to alter with what we have in + * trim_tree. + */ + metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp)); + } + if (auto_trim) { uint64_t start = 0; range_seg_t *rs; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 12042ccd2d80..0b99ba4c2e21 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8543,7 +8543,7 @@ spa_trim_update_time_sync(void *arg, dmu_tx_t *tx) * Passing UINT64_MAX for either start_time or stop_time means that no * update to that value should be recorded. */ -static dmu_tx_t * +static void spa_trim_update_time(spa_t *spa, uint64_t start_time, uint64_t stop_time) { int err; @@ -8558,12 +8558,11 @@ spa_trim_update_time(spa_t *spa, uint64_t start_time, uint64_t stop_time) err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); - return (NULL); + return; } dsl_sync_task_nowait(spa_get_dsl(spa), spa_trim_update_time_sync, spa, 1, ZFS_SPACE_CHECK_RESERVED, tx); - - return (tx); + dmu_tx_commit(tx); } /* @@ -8611,11 +8610,8 @@ spa_man_trim(spa_t *spa, uint64_t rate) (void (*)(void *))vdev_man_trim, vti, TQ_SLEEP); } spa_config_exit(spa, SCL_CONFIG, FTAG); - time_update_tx = spa_trim_update_time(spa, gethrestime_sec(), 0); + spa_trim_update_time(spa, gethrestime_sec(), 0); mutex_exit(&spa->spa_man_trim_lock); - /* mustn't hold spa_man_trim_lock to prevent deadlock /w syncing ctx */ - if (time_update_tx != NULL) - dmu_tx_commit(time_update_tx); } /* @@ -8697,24 +8693,20 @@ spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate, static void spa_vdev_man_trim_done(spa_t *spa) { - dmu_tx_t *time_update_tx = NULL; - mutex_enter(&spa->spa_man_trim_lock); ASSERT(spa->spa_num_man_trimming > 0); spa->spa_num_man_trimming--; if (spa->spa_num_man_trimming == 0) { /* if we were interrupted, leave stop_time at zero */ - if (!spa->spa_man_trim_stop) - time_update_tx = spa_trim_update_time(spa, UINT64_MAX, + if (!spa->spa_man_trim_stop) { + spa_trim_update_time(spa, UINT64_MAX, gethrestime_sec()); + } spa_event_notify(spa, NULL, NULL, ESC_ZFS_TRIM_FINISH); spa_async_request(spa, SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY); cv_broadcast(&spa->spa_man_trim_done_cv); } mutex_exit(&spa->spa_man_trim_lock); - - if (time_update_tx != NULL) - dmu_tx_commit(time_update_tx); } /* @@ -8734,13 +8726,15 @@ spa_vdev_auto_trim_done(spa_t *spa) /* * Determines the minimum sensible rate at which a manual TRIM can be - * performed on a given spa and returns it. Since we perform TRIM in - * metaslab-sized increments, we'll just let the longest step between - * metaslab TRIMs be 100s (random number, really). Thus, on a typical - * 200-metaslab vdev, the longest TRIM should take is about 5.5 hours. - * It *can* take longer if the device is really slow respond to - * zio_trim() commands or it contains more than 200 metaslabs, or - * metaslab sizes vary widely between top-level vdevs. + * performed on a given spa and returns it (in bytes per second). The + * value is calculated by assuming that TRIMming a metaslab should take + * no more than 1000s. The exact value here is not important, we just want + * to make sure that the calculated delay values in vdev_man_trim aren't + * too large (which might cause integer precision issues). Thus, on a + * typical 200-metaslab vdev, the longest TRIM should take is about 55 + * hours. It *can* take longer if the device is really slow respond to + * zio_trim() commands or it contains more than 200 metaslabs, or metaslab + * sizes vary widely between top-level vdevs. */ static uint64_t spa_min_trim_rate(spa_t *spa) @@ -8756,8 +8750,8 @@ spa_min_trim_rate(spa_t *spa) spa_config_exit(spa, SCL_CONFIG, FTAG); VERIFY(smallest_ms_sz != 0); - /* minimum TRIM rate is 1/100th of the smallest metaslab size */ - return (smallest_ms_sz / 100); + /* minimum TRIM rate is 1/1000th of the smallest metaslab size */ + return (smallest_ms_sz / 1000); } #if defined(_KERNEL) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3fa5a6fd73e5..79343dced86f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4789,7 +4789,7 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) void vdev_man_trim(vdev_trim_info_t *vti) { - clock_t t = ddi_get_lbolt(); + hrtime_t t = gethrtime(); spa_t *spa = vti->vti_vdev->vdev_spa; vdev_t *vd = vti->vti_vdev; uint64_t i, cursor; @@ -4828,19 +4828,20 @@ vdev_man_trim(vdev_trim_info_t *vti) /* delay loop to handle fixed-rate trimming */ for (;;) { uint64_t rate = spa->spa_man_trim_rate; - uint64_t sleep_delay; - clock_t t1; + hrtime_t sleep_delay; + hrtime_t t1; if (rate == 0) { /* No delay, just update 't' and move on. */ - t = ddi_get_lbolt(); + t = gethrtime(); break; } - sleep_delay = (delta * hz) / rate; + sleep_delay = SEC2NSEC(delta) / rate; mutex_enter(&spa->spa_man_trim_lock); - t1 = cv_timedwait(&spa->spa_man_trim_update_cv, - &spa->spa_man_trim_lock, t + sleep_delay); + t1 = cv_timedwait_hires(&spa->spa_man_trim_update_cv, + &spa->spa_man_trim_lock, t + sleep_delay, + MSEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); mutex_exit(&spa->spa_man_trim_lock); /* If interrupted, don't try to relock, get out */ @@ -4883,19 +4884,26 @@ vdev_auto_trim(vdev_trim_info_t *vti) uint64_t txg = vti->vti_txg; uint64_t txgs_per_trim = zfs_txgs_per_trim; uint64_t mlim = 0, mused = 0; - boolean_t limited; + boolean_t preserve_spilled; ASSERT3P(vd->vdev_top, ==, vd); if (vd->vdev_man_trimming) goto out; + /* + * In case trimming is slow and the previous trim run has no yet + * finished, we order metaslab_auto_trim to keep the extents that + * were about to be trimmed so that they can be trimmed in a future + * autotrim run. But we only do so if the amount of memory consumed + * by the extents doesn't exceed a threshold, otherwise we drop them. + */ spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); for (uint64_t i = 0; i < vd->vdev_ms_count; i++) mused += metaslab_trim_mem_used(vd->vdev_ms[i]); mlim = (physmem * PAGESIZE) / (zfs_trim_mem_lim_fact * spa->spa_root_vdev->vdev_children); - limited = mused > mlim; + preserve_spilled = mused < mlim; DTRACE_PROBE3(autotrim__mem__lim, vdev_t *, vd, uint64_t, mused, uint64_t, mlim); @@ -4910,7 +4918,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) */ for (uint64_t i = txg % txgs_per_trim; i < vd->vdev_ms_count; i += txgs_per_trim) - metaslab_auto_trim(vd->vdev_ms[i], !limited); + metaslab_auto_trim(vd->vdev_ms[i], preserve_spilled); spa_config_exit(spa, SCL_STATE_ALL, FTAG); From 5614f2b470cf594a663b4e6d4fef10672536f72a Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Thu, 18 May 2017 17:18:19 +0200 Subject: [PATCH 13/38] Deadlockiness associated with doing postponing trimming on a metaslab wanting to condense. Requires-builders: none --- cmd/zpool/zpool_main.c | 15 ++++++------ include/sys/vdev.h | 1 + module/zfs/metaslab.c | 54 +++++++++++++++++++++++++++++++----------- module/zfs/spa.c | 2 +- module/zfs/vdev.c | 38 +++++++++++++++-------------- 5 files changed, 69 insertions(+), 41 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 19cb8f682e01..a901cc08c5ca 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7070,7 +7070,6 @@ print_trim_status(uint64_t trim_prog, uint64_t total_size, uint64_t rate, time_t start_time = start_time_u64, end_time = end_time_u64; char *buf; - assert(trim_prog <= total_size); if (trim_prog != 0 && trim_prog != total_size) { buf = ctime(&start_time); buf[strlen(buf) - 1] = '\0'; /* strip trailing newline */ @@ -7078,12 +7077,12 @@ print_trim_status(uint64_t trim_prog, uint64_t total_size, uint64_t rate, char rate_str[32]; zfs_nicenum(rate, rate_str, sizeof (rate_str)); (void) printf(" trim: %.02f%%\tstarted: %s\t" - "(rate: %s/s)\n", (((double)trim_prog) / - total_size) * 100, buf, rate_str); + "(rate: %s/s)\n", MIN((((double)trim_prog) / + total_size) * 100, 100), buf, rate_str); } else { (void) printf(" trim: %.02f%%\tstarted: %s\t" - "(rate: max)\n", (((double)trim_prog) / - total_size) * 100, buf); + "(rate: max)\n", MIN((((double)trim_prog) / + total_size) * 100, 100), buf); } } else { if (start_time != 0) { @@ -7613,9 +7612,9 @@ status_callback(zpool_handle_t *zhp, void *data) * For whatever reason, root vdev_stats_t don't * include log devices. */ - print_trim_status(trim_prog, vs->vs_space + - zpool_slog_space(nvroot), trim_rate, - trim_start_time, trim_stop_time); + print_trim_status(trim_prog, (vs->vs_space - + vs->vs_alloc) + zpool_slog_space(nvroot), + trim_rate, trim_start_time, trim_stop_time); } (void) printf(gettext("config:\n\n")); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 9b09a2242706..87b1a95b0477 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -185,6 +185,7 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, extern void vdev_man_trim(vdev_trim_info_t *vti); extern void vdev_auto_trim(vdev_trim_info_t *vti); extern void vdev_trim_stop_wait(vdev_t *vd); +extern boolean_t vdev_trim_should_stop(vdev_t *vd); /* * Label routines diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index f4f01a427486..5bfb6d2f1169 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2479,7 +2479,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { + if (msp->ms_loaded && spa_sync_pass(spa) == 1 && + metaslab_should_condense(msp) && msp->ms_trimming_ts == NULL) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); @@ -4366,9 +4367,6 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, mutex_enter(&msp->ms_lock); - while (msp->ms_condensing) - cv_wait(&msp->ms_condensing_cv, &msp->ms_lock); - while (msp->ms_loading) metaslab_load_wait(msp); /* @@ -4608,6 +4606,7 @@ metaslab_trim_done(zio_t *zio) held = MUTEX_HELD(&msp->ms_lock); if (!held) mutex_enter(&msp->ms_lock); + VERIFY(!msp->ms_condensing); if (msp->ms_loaded) { range_tree_walk(msp->ms_trimming_ts, range_tree_add, msp->ms_tree); @@ -4649,12 +4648,35 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) const enum zio_flag trim_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CONFIG_WRITER; + zio_t *zio = NULL; ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* + * TRIM and condense are mutually exclusive, because during TRIM + * we're manipulating ms_tree to remove the extents that we're + * currently trimming. Metaslab condensing takes priority. + */ + while (msp->ms_condensing) + cv_wait(&msp->ms_condensing_cv, &msp->ms_lock); + /* wait for a preceding trim to finish */ - while (msp->ms_trimming_ts != NULL) + while (msp->ms_trimming_ts != NULL && !vdev_trim_should_stop(vd)) cv_wait(&msp->ms_trim_cv, &msp->ms_lock); + + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + + /* + * If a management operation is about to happen, we need to stop + * pushing new trims into the pipeline. + */ + if (vdev_trim_should_stop(vd)) { + metaslab_free_trimset(msp->ms_prev_ts); + msp->ms_prev_ts = NULL; + zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); + goto out; + } + msp->ms_trimming_ts = msp->ms_prev_ts; msp->ms_prev_ts = NULL; trim_tree = msp->ms_trimming_ts; @@ -4684,7 +4706,8 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) if (range_tree_space(trim_tree) == 0) { metaslab_free_trimset(msp->ms_trimming_ts); msp->ms_trimming_ts = NULL; - return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); + zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); + goto out; } if (msp->ms_loaded) { @@ -4704,8 +4727,8 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) range_seg_t *rs; range_tree_t *sub_trim_tree = range_tree_create(NULL, NULL, &msp->ms_lock); - zio_t *pio = zio_null(NULL, spa, vd, metaslab_trim_done, msp, - 0); + + zio = zio_null(NULL, spa, vd, metaslab_trim_done, msp, 0); rs = avl_first(&trim_tree->rt_root); if (rs != NULL) @@ -4725,7 +4748,7 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) ASSERT3U(range_tree_space(sub_trim_tree), <=, max_bytes); if (range_tree_space(sub_trim_tree) == max_bytes) { - zio_nowait(zio_trim_tree(pio, spa, vd, + zio_nowait(zio_trim_tree(zio, spa, vd, sub_trim_tree, auto_trim, NULL, NULL, trim_flags, msp)); range_tree_vacate(sub_trim_tree, NULL, NULL); @@ -4733,17 +4756,20 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) start = end; } if (range_tree_space(sub_trim_tree) != 0) { - zio_nowait(zio_trim_tree(pio, spa, vd, sub_trim_tree, + zio_nowait(zio_trim_tree(zio, spa, vd, sub_trim_tree, auto_trim, NULL, NULL, trim_flags, msp)); range_tree_vacate(sub_trim_tree, NULL, NULL); } range_tree_destroy(sub_trim_tree); - - return (pio); } else { - return (zio_trim_tree(NULL, spa, vd, trim_tree, auto_trim, - metaslab_trim_done, msp, trim_flags, msp)); + zio = zio_trim_tree(NULL, spa, vd, trim_tree, auto_trim, + metaslab_trim_done, msp, trim_flags, msp); } + + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + +out: + return (zio); } /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0b99ba4c2e21..b240873bfaf2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8717,7 +8717,7 @@ static void spa_vdev_auto_trim_done(spa_t *spa) { mutex_enter(&spa->spa_auto_trim_lock); - ASSERT(spa->spa_num_auto_trimming > 0); + VERIFY(spa->spa_num_auto_trimming > 0); spa->spa_num_auto_trimming--; if (spa->spa_num_auto_trimming == 0) cv_broadcast(&spa->spa_auto_trim_done_cv); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 79343dced86f..cb90032b177f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4792,6 +4792,7 @@ vdev_man_trim(vdev_trim_info_t *vti) hrtime_t t = gethrtime(); spa_t *spa = vti->vti_vdev->vdev_spa; vdev_t *vd = vti->vti_vdev; + uint64_t ms_count; uint64_t i, cursor; boolean_t was_loaded = B_FALSE; @@ -4799,20 +4800,22 @@ vdev_man_trim(vdev_trim_info_t *vti) vd->vdev_trim_prog = 0; spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + ms_count = vd->vdev_ms_count; + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + ASSERT(vd->vdev_ms[0] != NULL); cursor = vd->vdev_ms[0]->ms_start; i = 0; - while (i < vti->vti_vdev->vdev_ms_count && !spa->spa_man_trim_stop) { + while (i < ms_count && !spa->spa_man_trim_stop) { uint64_t delta; metaslab_t *msp = vd->vdev_ms[i]; zio_t *trim_io; trim_io = metaslab_trim_all(msp, &cursor, &delta, &was_loaded); - spa_config_exit(spa, SCL_STATE_ALL, FTAG); if (trim_io != NULL) { ASSERT3U(cursor, >=, vd->vdev_ms[0]->ms_start); - vd->vdev_trim_prog = cursor - vd->vdev_ms[0]->ms_start; + vd->vdev_trim_prog += delta; (void) zio_wait(trim_io); } else { /* @@ -4854,17 +4857,8 @@ vdev_man_trim(vdev_trim_info_t *vti) break; } } - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); } - spa_config_exit(spa, SCL_STATE_ALL, FTAG); out: - /* - * Ensure we're marked as "completed" even if we've had to stop - * before processing all metaslabs. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_trim_prog = vd->vdev_stat.vs_space; - mutex_exit(&vd->vdev_stat_lock); vd->vdev_man_trimming = B_FALSE; ASSERT(vti->vti_done_cb != NULL); @@ -4884,6 +4878,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) uint64_t txg = vti->vti_txg; uint64_t txgs_per_trim = zfs_txgs_per_trim; uint64_t mlim = 0, mused = 0; + uint64_t ms_count = vd->vdev_ms_count; boolean_t preserve_spilled; ASSERT3P(vd->vdev_top, ==, vd); @@ -4898,8 +4893,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) * autotrim run. But we only do so if the amount of memory consumed * by the extents doesn't exceed a threshold, otherwise we drop them. */ - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); - for (uint64_t i = 0; i < vd->vdev_ms_count; i++) + for (uint64_t i = 0; i < ms_count; i++) mused += metaslab_trim_mem_used(vd->vdev_ms[i]); mlim = (physmem * PAGESIZE) / (zfs_trim_mem_lim_fact * spa->spa_root_vdev->vdev_children); @@ -4916,12 +4910,9 @@ vdev_auto_trim(vdev_trim_info_t *vti) * txgs_per_trim. Thus, for the default 200 metaslabs and 32 * txgs_per_trim, we'll only be trimming ~6.25 metaslabs per txg. */ - for (uint64_t i = txg % txgs_per_trim; i < vd->vdev_ms_count; - i += txgs_per_trim) + for (uint64_t i = txg % txgs_per_trim; i < ms_count; i += txgs_per_trim) metaslab_auto_trim(vd->vdev_ms[i], preserve_spilled); - spa_config_exit(spa, SCL_STATE_ALL, FTAG); - out: ASSERT(vti->vti_done_cb != NULL); vti->vti_done_cb(vti->vti_done_arg); @@ -4979,12 +4970,23 @@ vdev_trim_stop_wait(vdev_t *vd) trim_stop_set(vd, B_FALSE); } +/* + * Returns true if a management operation (such as attach/add) is trying to + * grab this vdev and therefore any ongoing trims should be canceled. + */ +boolean_t +vdev_trim_should_stop(vdev_t *vd) +{ + return (vd->vdev_trim_zios_stop); +} + #if defined(_KERNEL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); + /* BEGIN CSTYLED */ module_param(vdev_max_ms_count, int, 0644); MODULE_PARM_DESC(vdev_max_ms_count, From feae3c2d273f18e11c44414da08677ddca4b62eb Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Mon, 22 May 2017 09:21:08 +0200 Subject: [PATCH 14/38] Matt Ahrens' review comments, round 5. Requires-builders: none --- cmd/zpool/zpool_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index a901cc08c5ca..88f2783809f8 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6703,7 +6703,8 @@ zpool_do_resilver(int argc, char **argv) * zpool trim [-s|-r ] ... * * -s Stop. Stops any in-progress trim. - * -r Sets the TRIM rate. + * -r Sets the TRIM rate in bytes (per second). Supports + * adding a multiplier suffix such as 'k' or 'm'. */ int zpool_do_trim(int argc, char **argv) @@ -7077,11 +7078,11 @@ print_trim_status(uint64_t trim_prog, uint64_t total_size, uint64_t rate, char rate_str[32]; zfs_nicenum(rate, rate_str, sizeof (rate_str)); (void) printf(" trim: %.02f%%\tstarted: %s\t" - "(rate: %s/s)\n", MIN((((double)trim_prog) / + "(rate limit: %s/s)\n", MIN((((double)trim_prog) / total_size) * 100, 100), buf, rate_str); } else { (void) printf(" trim: %.02f%%\tstarted: %s\t" - "(rate: max)\n", MIN((((double)trim_prog) / + "(rate limit: none)\n", MIN((((double)trim_prog) / total_size) * 100, 100), buf); } } else { From 7d126637dbe8d7d22a9d09574b5621dadf1222db Mon Sep 17 00:00:00 2001 From: Saso Kiselkov Date: Mon, 22 May 2017 13:51:52 +0200 Subject: [PATCH 15/38] Deadlockiness in autotrim due to recent changes. Requires-builders: none --- module/zfs/metaslab.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5bfb6d2f1169..cbcdc4a59695 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -4664,8 +4664,6 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) while (msp->ms_trimming_ts != NULL && !vdev_trim_should_stop(vd)) cv_wait(&msp->ms_trim_cv, &msp->ms_lock); - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); - /* * If a management operation is about to happen, we need to stop * pushing new trims into the pipeline. @@ -4673,10 +4671,11 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) if (vdev_trim_should_stop(vd)) { metaslab_free_trimset(msp->ms_prev_ts); msp->ms_prev_ts = NULL; - zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); - goto out; + return (zio_null(NULL, spa, NULL, NULL, NULL, 0)); } + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER); + msp->ms_trimming_ts = msp->ms_prev_ts; msp->ms_prev_ts = NULL; trim_tree = msp->ms_trimming_ts; @@ -4766,9 +4765,9 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) metaslab_trim_done, msp, trim_flags, msp); } +out: spa_config_exit(spa, SCL_STATE_ALL, FTAG); -out: return (zio); } From feb47a9a2aa6e8899686e1afde2c1860d030bbbb Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Mon, 10 Apr 2017 11:41:11 -0500 Subject: [PATCH 16/38] Want manual trim feature to skip never-allocated space Some storage backends such as large thinly-provisioned SANs are very slow for large trims. Manual trim now supports "zpool trim -p" (partial trim) to skip metaslabs for which there is no spacemap. Signed-off-by: Tim Chase Requires-builders: none --- cmd/zpool/zpool_main.c | 10 ++++++++-- include/libzfs.h | 4 +++- include/sys/fs/zfs.h | 1 + include/sys/spa.h | 2 +- include/sys/vdev.h | 1 + lib/libzfs/libzfs_pool.c | 6 ++++-- module/zfs/spa.c | 10 +++++++--- module/zfs/vdev.c | 29 +++++++++++++++++++++++++++-- module/zfs/zfs_ioctl.c | 2 +- 9 files changed, 53 insertions(+), 12 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 88f2783809f8..2713bd60de41 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6589,6 +6589,7 @@ scrub_callback(zpool_handle_t *zhp, void *data) typedef struct trim_cbdata { boolean_t cb_start; uint64_t cb_rate; + boolean_t cb_fulltrim; } trim_cbdata_t; int @@ -6606,7 +6607,7 @@ trim_callback(zpool_handle_t *zhp, void *data) return (1); } - err = zpool_trim(zhp, cb->cb_start, cb->cb_rate); + err = zpool_trim(zhp, cb->cb_start, cb->cb_rate, cb->cb_fulltrim); return (err != 0); } @@ -6702,6 +6703,7 @@ zpool_do_resilver(int argc, char **argv) /* * zpool trim [-s|-r ] ... * + * -p Partial trim. Skips never-allocated space. * -s Stop. Stops any in-progress trim. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. @@ -6714,10 +6716,14 @@ zpool_do_trim(int argc, char **argv) cb.cb_start = B_TRUE; cb.cb_rate = 0; + cb.cb_fulltrim = B_TRUE; /* check options */ - while ((c = getopt(argc, argv, "sr:")) != -1) { + while ((c = getopt(argc, argv, "psr:")) != -1) { switch (c) { + case 'p': + cb.cb_fulltrim = B_FALSE; + break; case 's': cb.cb_start = B_FALSE; break; diff --git a/include/libzfs.h b/include/libzfs.h index 78b3bd3cbe09..b08bc021d0ff 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -258,7 +258,9 @@ typedef struct splitflags { extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); -extern int zpool_trim(zpool_handle_t *, boolean_t start, uint64_t rate); +extern int zpool_trim(zpool_handle_t *, boolean_t start, uint64_t rate, + boolean_t fulltrim); + extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen_one(zpool_handle_t *, void *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 6d01882352f6..e918a640d890 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -927,6 +927,7 @@ typedef struct pool_checkpoint_stat { typedef struct trim_cmd_info_s { uint64_t tci_start; /* B_TRUE = start; B_FALSE = stop */ uint64_t tci_rate; /* requested TRIM rate in bytes/sec */ + uint64_t tci_fulltrim; /* B_TRUE=trim never allocated space */ } trim_cmd_info_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index ddb19b1fadfc..a5288fe07d28 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -837,7 +837,7 @@ extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* trimming */ -extern void spa_man_trim(spa_t *spa, uint64_t rate); +extern void spa_man_trim(spa_t *spa, uint64_t rate, boolean_t fulltrim); extern void spa_man_trim_stop(spa_t *spa); extern void spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate, uint64_t *start_time, uint64_t *stop_time); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 87b1a95b0477..9e64affd2c6a 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -183,6 +183,7 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); extern void vdev_man_trim(vdev_trim_info_t *vti); +extern void vdev_man_trim_full(vdev_trim_info_t *vti); extern void vdev_auto_trim(vdev_trim_info_t *vti); extern void vdev_trim_stop_wait(vdev_t *vd); extern boolean_t vdev_trim_should_stop(vdev_t *vd); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 06dd399c4a24..bfe238778f24 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2259,12 +2259,14 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) * Trim the pool. */ int -zpool_trim(zpool_handle_t *zhp, boolean_t start, uint64_t rate) +zpool_trim(zpool_handle_t *zhp, boolean_t start, uint64_t rate, + boolean_t fulltrim) { zfs_cmd_t zc = {"\0"}; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; - trim_cmd_info_t tci = { .tci_start = start, .tci_rate = rate }; + trim_cmd_info_t tci = { .tci_start = start, .tci_rate = rate, + .tci_fulltrim = fulltrim }; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = (uintptr_t)&tci; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b240873bfaf2..cdd3cb95fd26 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8572,11 +8572,15 @@ spa_trim_update_time(spa_t *spa, uint64_t start_time, uint64_t stop_time) * space to the underlying vdevs. */ extern void -spa_man_trim(spa_t *spa, uint64_t rate) +spa_man_trim(spa_t *spa, uint64_t rate, boolean_t fulltrim) { - dmu_tx_t *time_update_tx; + void (*trimfunc)(void *); mutex_enter(&spa->spa_man_trim_lock); + if (fulltrim) + trimfunc = (void (*)(void *))vdev_man_trim_full; + else + trimfunc = (void (*)(void *))vdev_man_trim; if (rate != 0) spa->spa_man_trim_rate = MAX(rate, spa_min_trim_rate(spa)); @@ -8607,7 +8611,7 @@ spa_man_trim(spa_t *spa, uint64_t rate) vd->vdev_trim_prog = 0; (void) taskq_dispatch(spa->spa_man_trim_taskq, - (void (*)(void *))vdev_man_trim, vti, TQ_SLEEP); + trimfunc, vti, TQ_SLEEP); } spa_config_exit(spa, SCL_CONFIG, FTAG); spa_trim_update_time(spa, gethrestime_sec(), 0); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index cb90032b177f..8d879b4be958 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4785,9 +4785,11 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) * Implements the per-vdev portion of manual TRIM. The function passes over * all metaslabs on this vdev and performs a metaslab_trim_all on them. It's * also responsible for rate-control if spa_man_trim_rate is non-zero. + * + * If fulltrim is set, metaslabs without spacemaps are also trimmed. */ -void -vdev_man_trim(vdev_trim_info_t *vti) +static void +vdev_man_trim_impl(vdev_trim_info_t *vti, boolean_t fulltrim) { hrtime_t t = gethrtime(); spa_t *spa = vti->vti_vdev->vdev_spa; @@ -4811,6 +4813,17 @@ vdev_man_trim(vdev_trim_info_t *vti) metaslab_t *msp = vd->vdev_ms[i]; zio_t *trim_io; + if (msp->ms_sm == NULL && !fulltrim) { + /* + * If the space map has not been allocated and a + * partial trim was requested move on to the next one. + */ + i++; + if (i < vti->vti_vdev->vdev_ms_count) + cursor = vd->vdev_ms[i]->ms_start; + continue; + } + trim_io = metaslab_trim_all(msp, &cursor, &delta, &was_loaded); if (trim_io != NULL) { @@ -4867,6 +4880,18 @@ vdev_man_trim(vdev_trim_info_t *vti) kmem_free(vti, sizeof (*vti)); } +void +vdev_man_trim(vdev_trim_info_t *vti) +{ + vdev_man_trim_impl(vti, B_FALSE); +} + +void +vdev_man_trim_full(vdev_trim_info_t *vti) +{ + vdev_man_trim_impl(vti, B_TRUE); +} + /* * Runs through all metaslabs on the vdev and does their autotrim processing. */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 71b200da194c..7c291590a8d0 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1778,7 +1778,7 @@ zfs_ioc_pool_trim(zfs_cmd_t *zc) return (error); if (tci.tci_start) { - spa_man_trim(spa, tci.tci_rate); + spa_man_trim(spa, tci.tci_rate, tci.tci_fulltrim); } else { spa_man_trim_stop(spa); } From 0e381de523f6e4973f895b3012bfcd3ca8c40af1 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 22 May 2017 13:50:37 -0400 Subject: [PATCH 17/38] Update and add additional TRIM test cases The existing test cases were split in to multiple test cases and refactored. There are now test cases for the following: zpool_trim_001_pos - Verify manual TRIM zpool_trim_002_pos - Verify manual trim can be interrupted zpool_trim_003_pos - Verify 'zpool trim -s' rate limiting zpool_trim_004_pos - Verify 'zpool trim -p' partial TRIM works zpool_trim_005_neg - Verify bad parameters to 'zpool trim' zpool_trim_006_neg - Verify bad parameters to 'zpool trim -r' autotrim_001_pos - Verify 'autotrim=on' pool data integrity autotrim_002_pos - Verify various pool geometries manualtrim_001_pos - Verify manual trim pool data integrity manualtrim_002_pos - Verify various pool geometries manualtrim_003_pos - Verify 'zpool import|export' manualtrim_004_pos - Verify 'zpool online|offline|replace' manualtrim_005_pos - Verify TRIM and scrub run concurrently Signed-off-by: Brian Behlendorf Requires-builders: none --- configure.ac | 1 + tests/runfiles/linux.run | 8 +- tests/zfs-tests/include/libtest.shlib | 4 +- .../tests/functional/cli_root/Makefile.am | 1 + .../cli_root/zpool_trim/Makefile.am | 10 + .../cli_root/zpool_trim/cleanup.ksh | 36 +++ .../functional/cli_root/zpool_trim/setup.ksh | 40 +++ .../zpool_trim/zpool_trim_001_pos.ksh | 58 +++++ .../zpool_trim/zpool_trim_002_pos.ksh | 67 +++++ .../zpool_trim/zpool_trim_003_pos.ksh | 75 ++++++ .../zpool_trim/zpool_trim_004_pos.ksh | 61 +++++ .../zpool_trim/zpool_trim_005_neg.ksh | 52 ++++ .../zpool_trim/zpool_trim_006_neg.ksh | 52 ++++ .../tests/functional/trim/Makefile.am | 9 +- .../functional/trim/autotrim_001_pos.ksh | 113 +++------ .../functional/trim/autotrim_002_pos.ksh | 91 +++++++ .../tests/functional/trim/cleanup.ksh | 11 +- .../functional/trim/manualtrim_001_pos.ksh | 100 +++----- .../functional/trim/manualtrim_002_pos.ksh | 91 +++++++ .../functional/trim/manualtrim_003_pos.ksh | 74 ++++++ .../functional/trim/manualtrim_004_pos.ksh | 108 ++++++++ .../functional/trim/manualtrim_005_pos.ksh | 78 ++++++ .../zfs-tests/tests/functional/trim/setup.ksh | 16 +- .../zfs-tests/tests/functional/trim/trim.cfg | 73 +++--- .../tests/functional/trim/trim.kshlib | 231 ++++++++++++++++-- 25 files changed, 1260 insertions(+), 200 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_005_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/autotrim_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/manualtrim_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/manualtrim_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/manualtrim_005_pos.ksh diff --git a/configure.ac b/configure.ac index e773d35e89df..28b6419e5305 100644 --- a/configure.ac +++ b/configure.ac @@ -262,6 +262,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile tests/zfs-tests/tests/functional/cli_user/Makefile diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 72e6ea2f7a8b..45148a111341 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -469,6 +469,10 @@ tags = ['functional', 'cli_root', 'zpool_status'] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] +[tests/functional/cli_root/zpool_trim] +tests = ['zpool_trim_001_pos', 'zpool_trim_002_pos', 'zpool_trim_003_pos', + 'zpool_trim_004_pos', 'zpool_trim_005_neg', 'zpool_trim_006_neg'] + [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', @@ -837,7 +841,9 @@ tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos'] tags = ['functional', 'tmpfile'] [tests/functional/trim] -tests = ['autotrim_001_pos', 'manualtrim_001_pos'] +tests = ['autotrim_001_pos', 'autotrim_002_pos', 'manualtrim_001_pos', + 'manualtrim_002_pos', 'manualtrim_003_pos', 'manualtrim_004_pos', + 'manualtrim_005_pos'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 482ab5ef5101..33623710107c 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -23,10 +23,12 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # Copyright (c) 2012, 2017 by Delphix. All rights reserved. -# Copyright 2016 Nexenta Systems, Inc. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +# Use is subject to license terms. # . ${STF_TOOLS}/include/logapi.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/Makefile.am index 625cf8579f82..99f1257837c9 100644 --- a/tests/zfs-tests/tests/functional/cli_root/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/Makefile.am @@ -59,4 +59,5 @@ SUBDIRS = \ zpool_split \ zpool_status \ zpool_sync \ + zpool_trim \ zpool_upgrade diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am new file mode 100644 index 000000000000..07cad559fe2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am @@ -0,0 +1,10 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_trim +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh \ + zpool_trim_001_pos.ksh \ + zpool_trim_002_pos.ksh \ + zpool_trim_003_pos.ksh \ + zpool_trim_004_pos.ksh \ + zpool_trim_005_neg.ksh \ + zpool_trim_006_neg.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh new file mode 100755 index 000000000000..79acb41b79aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +if [ -n "$HOST_POOL_NAME" ]; then + log_must zpool destroy "$HOST_POOL_NAME" +fi + +log_pass TRIM cleanup succeeded diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh new file mode 100755 index 000000000000..5399d7a1b8bb --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh @@ -0,0 +1,40 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +if [ -n "$HOST_POOL_NAME" ]; then + log_note "Creating TRIM host pool to control recordsize" + log_must zpool create -o cachefile=none -O recordsize=4k \ + -O mountpoint="$VDEV_DIR" "$HOST_POOL_NAME" "$HOST_POOL_DISK" +fi + +log_must rm -f $VDEVS + +log_pass TRIM setup succeeded diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_001_pos.ksh new file mode 100755 index 000000000000..efdaa2013df3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_001_pos.ksh @@ -0,0 +1,58 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify manual 'zpool trim'. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool. +# 5. Verify the completion status. + +verify_runnable "global" + +log_assert "Run 'zpool trim' to TRIM pool" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 16 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +do_trim $TRIMPOOL +log_must zpool destroy $TRIMPOOL + +log_pass "Manual TRIM successful" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_002_pos.ksh new file mode 100755 index 000000000000..9e9a891c4734 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_002_pos.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify manual 'zpool trim' can be interrupted. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool with rate limiting. +# 5. Verify the TRIM can be cancelled. + +verify_runnable "global" + +log_assert "Run 'zpool trim -s' to cancel manual TRIM" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 16 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +# Run trim at the minimal rate so it can be interrupted. +log_must zpool trim -r 1 $TRIMPOOL +log_must zpool trim -s $TRIMPOOL +sync_pool $TRIMPOOL + +typeset status=$(zpool status $TRIMPOOL | awk '/trim:/{print $2}') +[[ "$status" = "interrupted" ]] || log_fail "Manual TRIM was not interrupted" + +log_must zpool destroy $TRIMPOOL + +log_pass "Manual TRIM successfully cancelled" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_003_pos.ksh new file mode 100755 index 000000000000..9e6b140775f0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_003_pos.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify 'zpool trim -s' rate limiting. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool with rate limiting. +# 5. Verify the TRIM can be cancelled. + +verify_runnable "global" + +log_assert "Verify 'zpool trim -r' rate limiting" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 16 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +# Run 'zpool trim -r' multiple times to change the rate. +set -A args "1" "1K" "1M" "1G" +set -A expect "K/s" "K/s" "M/s" "G/s" +typeset -i i=0 +typeset rate +while [[ $i -lt ${#args[*]} ]]; do + log_must zpool trim -r ${args[i]} $TRIMPOOL + rate=$(zpool status $TRIMPOOL | tr '()' ' ' | awk '/trim:/ {print $11}') + if [ $(echo $rate | grep ${expect[i]}) ]; then + log_note "Reported rate $rate matches expected ${expect[i]}" + else + log_fail "Incorrect reported rate $rate expected ${expect[i]}" + fi + ((i = i + 1)) +done + +# Set the rate to unlimited and wait for completion. +do_trim $TRIMPOOL +log_must zpool destroy $TRIMPOOL + +log_pass "Manual TRIM rate can be modified" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_004_pos.ksh new file mode 100755 index 000000000000..6fc21e25c62a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_004_pos.ksh @@ -0,0 +1,61 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify 'zpool trim -p' partial trim. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Run 'zpool trim -p' to only TRIM allocated space maps. +# 3. Verify the vdevs are at least 90% of their original size. +# 4. Run 'zpool trim' to perform a full TRIM. +# 5. Verify the vdevs are less than 10% of their original size. + +verify_runnable "global" + +log_assert "Run 'zpool trim -p' to perform a partial TRIM" +log_onexit cleanup_trim + +log_must mkfile $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +typeset vdev_min_size=$(( floor(VDEV_SIZE * 0.10 / 1024 / 1024) )) +typeset vdev_max_size=$(( floor(VDEV_SIZE * 0.90 / 1024 / 1024) )) + +do_trim $TRIMPOOL "-p" +check_vdevs "-gt" "$vdev_max_size" + +do_trim $TRIMPOOL +check_vdevs "-lt" "$vdev_min_size" + +log_must zpool destroy $TRIMPOOL + +log_pass "Manual 'zpool trim -p' successfully TRIMmed pool" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_005_neg.ksh new file mode 100755 index 000000000000..87119564b090 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_005_neg.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# A badly formed parameter passed to 'zpool trim' should +# return an error. +# +# STRATEGY: +# 1. Create an array containing bad 'zpool trim' parameters. +# 2. For each element, execute the sub-command. +# 3. Verify it returns an error. +# + +verify_runnable "global" + +set -A args "1" "-a" "-?" "--%" "-123456" "0.5" "-o" "-b" "-b no" "-z 2" + +log_assert "Execute 'zpool trim' using invalid parameters." +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +typeset -i i=0 +while [[ $i -lt ${#args[*]} ]]; do + log_mustnot zpool trim ${args[i]} $TRIMPOOL + ((i = i + 1)) +done + +log_must zpool destroy $TRIMPOOL + +log_pass "Invalid parameters to 'zpool trim' fail as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_006_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_006_neg.ksh new file mode 100755 index 000000000000..ce52a1f78daf --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_006_neg.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# A badly formed parameter passed to 'zpool trim -r' should +# return an error. +# +# STRATEGY: +# 1. Create an array containing bad 'zpool trim -r' parameters. +# 2. For each element, execute the sub-command. +# 3. Verify it returns an error. +# + +verify_runnable "global" + +set -A args "a" "--%" "10X" "yes" "-?" "z 99" + +log_assert "Execute 'zpool trim -r' using invalid parameters." +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +typeset -i i=0 +while [[ $i -lt ${#args[*]} ]]; do + log_mustnot zpool trim -r ${args[i]} $TRIMPOOL + ((i = i + 1)) +done + +log_must zpool destroy $TRIMPOOL + +log_pass "Invalid parameters to 'zpool trim -r' fail as expected." diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am index a379bf898fd5..c08a2aba5bc8 100644 --- a/tests/zfs-tests/tests/functional/trim/Makefile.am +++ b/tests/zfs-tests/tests/functional/trim/Makefile.am @@ -1,8 +1,13 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/trim dist_pkgdata_SCRIPTS = \ setup.ksh \ + cleanup.ksh \ trim.cfg \ trim.kshlib \ - cleanup.ksh \ autotrim_001_pos.ksh \ - manualtrim_001_pos.ksh + autotrim_002_pos.ksh \ + manualtrim_001_pos.ksh \ + manualtrim_002_pos.ksh \ + manualtrim_003_pos.ksh \ + manualtrim_004_pos.ksh \ + manualtrim_005_pos.ksh diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh index 239ce86eb9cf..cd90528447ed 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_001_pos.ksh @@ -21,94 +21,55 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# -# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/trim/trim.cfg . $STF_SUITE/tests/functional/trim/trim.kshlib -set_tunable zfs_trim_min_ext_sz 4096 -set_tunable zfs_txgs_per_trim 2 - -function getsizemb -{ - typeset rval - - rval=$(du --block-size 1048576 -s "$1" | awk '{print $1}') - echo -n "$rval" -} +# +# DESCRIPTION: +# Verify 'autotrim=on' pool data integrity. +# +# STRATEGY: +# 1. Create a pool on the provided DISKS to TRIM. +# 2. Set 'autotrim=on' on pool. +# 3. Concurrently write randomly sized files to the pool, files are +# written with <=128K writes with an fsync after each write. +# 4. Remove files after being written, the random nature of the IO +# in intended to create a wide variety of TRIMable regions. +# 5. Create and destroy snapshots and clones to create TRIMable blocks. +# 6. Verify TRIM IOs of the expected type were issued for the pool. +# 7. Verify data integrity of the pool after TRIM. +# 8. Repeat for test for striped, mirrored, and RAIDZ pools. -function checkvdevs -{ - typeset vd sz +verify_runnable "global" - for vd in $VDEVS; do - sz=$(getsizemb $vd) - log_note Size of $vd is $sz MB - log_must test $sz -le $SHRUNK_SIZE_MB - done -} +if [ $(echo ${TRIM_DISKS} | nawk '{print NF}') -lt 2 ]; then + log_unsupported "Too few disks available (2 disk minimum)" +fi -function txgs -{ - typeset x +log_assert "Set 'autotrim=on' verify pool data integrity" +log_onexit cleanup_trim - # Run some txgs in order to let autotrim do its work. - # - for x in 1 2 3; do - log_must zfs snapshot $TRIMPOOL@snap - log_must zfs destroy $TRIMPOOL@snap - log_must zfs snapshot $TRIMPOOL@snap - log_must zfs destroy $TRIMPOOL@snap - done -} +# Minimum TRIM size is descreased to verity all TRIM sizes. +set_tunable64 zfs_trim_min_ext_sz 4096 -# -# Check various pool geometries: Create the pool, fill it, remove the test file, -# run some txgs, export the pool and verify that the vdevs shrunk. -# +# Reduced zfs_txgs_per_trim to make TRIMing more frequent. +set_tunable32 zfs_txgs_per_trim 2 -# -# raidz -# -for z in 1 2 3; do - setupvdevs - log_must zpool create -f $TRIMPOOL raidz$z $VDEVS +for type in "" "mirror" "raidz"; do + log_must zpool create -o cachefile=none -f $TRIMPOOL $type $TRIM_DISKS log_must zpool set autotrim=on $TRIMPOOL - log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w - log_must rm "/$TRIMPOOL/$TESTFILE" - txgs - log_must zpool export $TRIMPOOL - checkvdevs + write_remove + snap_clone + wait_trim_io $TRIMPOOL "auto" 10 + check_trim_io $TRIMPOOL "auto" + check_pool $TRIMPOOL + log_must zpool destroy $TRIMPOOL done -# -# mirror -# -setupvdevs -log_must zpool create -f $TRIMPOOL mirror $MIRROR_VDEVS_1 mirror $MIRROR_VDEVS_2 -log_must zpool set autotrim=on $TRIMPOOL -log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w -log_must rm "/$TRIMPOOL/$TESTFILE" -txgs -log_must zpool export $TRIMPOOL -checkvdevs - -# -# stripe -# -setupvdevs -log_must zpool create -f $TRIMPOOL $STRIPE_VDEVS -log_must zpool set autotrim=on $TRIMPOOL -log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w -log_must rm "/$TRIMPOOL/$TESTFILE" -txgs -log_must zpool export $TRIMPOOL -checkvdevs - -log_pass TRIM successfully shrunk vdevs +log_pass "Auto TRIM successfully scrubbed vdevs" diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_002_pos.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_002_pos.ksh new file mode 100755 index 000000000000..b2f22f330ca7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/autotrim_002_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Check various pool geometries (raidz[1-3], mirror, stripe) +# +# STRATEGY: +# 1. Create a pool on file vdevs to TRIM. +# 2. Set 'autotrim=on' on pool. +# 3. Fill the pool to a known percentage of capacity. +# 4. Verify the vdevs contain 25% or more allocated blocks. +# 5. Remove all files making the free blocks TRIMable. +# 6. Wait for autotrim to issue TRIM IOs for the free blocks. +# 4. Verify the vdevs contain 5% or less allocated blocks. +# 8. Repeat for test for striped, mirrored, and RAIDZ pools. + +verify_runnable "global" + +log_assert "Set 'autotrim=on' verify pool vdevs shrink" +log_onexit cleanup_trim + +# Minimum TRIM size is descreased to verity all TRIM sizes. +set_tunable64 zfs_trim_min_ext_sz 4096 + +# Reduced zfs_txgs_per_trim to make TRIMing more frequent. +set_tunable32 zfs_txgs_per_trim 2 + +typeset vdev_max_mb=$(( floor(VDEV_SIZE * 0.25 / 1024 / 1024) )) +typeset vdev_min_mb=$(( floor(VDEV_SIZE * 0.05 / 1024 / 1024) )) + +for type in "" "mirror" "raidz" "raidz2" "raidz3"; do + log_must truncate -s $VDEV_SIZE $VDEVS + log_must zpool create -o cachefile=none -f $TRIMPOOL $type $VDEVS + log_must zpool set autotrim=on $TRIMPOOL + + # Fill pool. Striped, mirrored, and raidz pools are filled to + # different capacities due to differences in the reserved space. + typeset availspace=$(get_prop available $TRIMPOOL) + if [[ "$type" = "mirror" ]]; then + typeset fill_mb=$(( floor(availspace * 0.65 / 1024 / 1024) )) + elif [[ "$type" = "" ]]; then + typeset fill_mb=$(( floor(availspace * 0.35 / 1024 / 1024) )) + else + typeset fill_mb=$(( floor(availspace * 0.40 / 1024 / 1024) )) + fi + + log_must file_write -o create -f /$TRIMPOOL/$TESTFILE \ + -b 1048576 -c $fill_mb -d R + log_must zpool sync + check_vdevs "-gt" "$vdev_max_mb" + + # Remove the file vdev usage should drop to less than 5%. + log_must rm /$TRIMPOOL/$TESTFILE + wait_trim_io $TRIMPOOL "auto" 10 + check_vdevs "-le" "$vdev_min_mb" + + log_must zpool destroy $TRIMPOOL + log_must rm -f $VDEVS +done + +log_pass "Auto TRIM successfully shrunk vdevs" diff --git a/tests/zfs-tests/tests/functional/trim/cleanup.ksh b/tests/zfs-tests/tests/functional/trim/cleanup.ksh index e8d1515e660a..79acb41b79aa 100755 --- a/tests/zfs-tests/tests/functional/trim/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/trim/cleanup.ksh @@ -21,11 +21,16 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib -rm -f $VDEVS +if [ -n "$HOST_POOL_NAME" ]; then + log_must zpool destroy "$HOST_POOL_NAME" +fi + +log_pass TRIM cleanup succeeded diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh index 7603a85cfd26..b79a71c9b1d0 100755 --- a/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_001_pos.ksh @@ -21,80 +21,54 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# -# Copyright (c) 2013, 2014 by Delphix. All rights reserved. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/trim/trim.cfg . $STF_SUITE/tests/functional/trim/trim.kshlib -set_tunable zfs_trim_min_ext_sz 4096 +# +# DESCRIPTION: +# Verify manual trim pool data integrity. +# +# STRATEGY: +# 1. Create a pool on the provided DISKS to TRIM. +# 2. Concurrently write randomly sized files to the pool, files are +# written with <=128K writes with an fsync after each write. +# 3. Remove files after being written, the random nature of the IO +# in intended to create a wide variety of TRIMable regions. +# 4. Create and destroy snapshots and clones to create TRIMable blocks. +# 5. Manually TRIM the pool. +# 6. Verify TRIM IOs of the expected type were issued for the pool. +# 7. Verify data integrity of the pool after TRIM. +# 8. Repeat for test for striped, mirrored, and RAIDZ pools. -function getsizemb -{ - typeset rval +verify_runnable "global" - rval=$(du --block-size 1048576 -s "$1" | sed -e 's;[ ].*;;') - echo -n "$rval" -} +if [ $(echo ${TRIM_DISKS} | nawk '{print NF}') -lt 2 ]; then + log_unsupported "Too few disks available (2 disk minimum)" +fi -function checkvdevs -{ - typeset vd sz +log_assert "Run 'zpool trim' verify pool data integrity" +log_onexit cleanup_trim - for vd in $VDEVS; do - sz=$(getsizemb $vd) - log_note Size of $vd is $sz MB - log_must test $sz -le $SHRUNK_SIZE_MB - done -} +# Minimum TRIM size is descreased to verity all TRIM sizes. +set_tunable64 zfs_trim_min_ext_sz 4096 -function dotrim -{ - log_must rm "/$TRIMPOOL/$TESTFILE" - log_must zpool export $TRIMPOOL - log_must zpool import -d $VDEVDIR $TRIMPOOL - log_must zpool trim $TRIMPOOL - sleep 5 - log_must zpool export $TRIMPOOL -} +# Reduced zfs_txgs_per_trim to make TRIMing more frequent. +set_tunable32 zfs_txgs_per_trim 2 -# -# Check various pool geometries: Create the pool, fill it, remove the test file, -# perform a manual trim, export the pool and verify that the vdevs shrunk. -# - -# -# raidz -# -for z in 1 2 3; do - setupvdevs - log_must zpool create -f $TRIMPOOL raidz$z $VDEVS - log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w - dotrim - checkvdevs +for type in "" "mirror" "raidz"; do + log_must zpool create -o cachefile=none -f $TRIMPOOL $type $TRIM_DISKS + write_remove + snap_clone + do_trim $TRIMPOOL + check_trim_io $TRIMPOOL "man" + check_pool $TRIMPOOL + log_must zpool destroy $TRIMPOOL done -# -# mirror -# -setupvdevs -log_must zpool create -f $TRIMPOOL mirror $MIRROR_VDEVS_1 mirror $MIRROR_VDEVS_2 -log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w -dotrim -checkvdevs - -# -# stripe -# -setupvdevs -log_must zpool create -f $TRIMPOOL $STRIPE_VDEVS -log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c $NUM_WRITES -d R -w -dotrim -checkvdevs - -log_pass Manual TRIM successfully shrunk vdevs +log_pass "Manual TRIM successfully scrubbed vdevs" diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_002_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_002_pos.ksh new file mode 100755 index 000000000000..4c04a71cf42c --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_002_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Check various pool geometries (raidz[1-3], mirror, stripe) +# +# STRATEGY: +# 1. Create a pool on file vdevs to TRIM. +# 2. Fill the pool to a known percentage of capacity. +# 3. Verify the vdevs contain 25% or more allocated blocks. +# 4. Remove all files making the free blocks TRIMable. +# 5. Manually TRIM the pool. +# 6. Wait for manual trim issue TRIM IOs for the free blocks. +# 4. Verify the vdevs contain 5% or less allocated blocks. +# 8. Repeat for test for striped, mirrored, and RAIDZ pools. + +verify_runnable "global" + +log_assert "Run 'zpool trim' verify pool vdevs shrink" +log_onexit cleanup_trim + +# Minimum TRIM size is descreased to verity all TRIM sizes. +set_tunable64 zfs_trim_min_ext_sz 4096 + +# Reduced zfs_txgs_per_trim to make TRIMing more frequent. +set_tunable32 zfs_txgs_per_trim 2 + +typeset vdev_max_mb=$(( floor(VDEV_SIZE * 0.25 / 1024 / 1024) )) +typeset vdev_min_mb=$(( floor(VDEV_SIZE * 0.05 / 1024 / 1024) )) + +for type in "" "mirror" "raidz" "raidz2" "raidz3"; do + log_must truncate -s $VDEV_SIZE $VDEVS + log_must zpool create -o cachefile=none -f $TRIMPOOL $type $VDEVS + + # Fill pool. Striped, mirrored, and raidz pools are filled to + # different capacities due to differences in the reserved space. + typeset availspace=$(get_prop available $TRIMPOOL) + if [[ "$type" = "mirror" ]]; then + typeset fill_mb=$(( floor(availspace * 0.65 / 1024 / 1024) )) + elif [[ "$type" = "" ]]; then + typeset fill_mb=$(( floor(availspace * 0.35 / 1024 / 1024) )) + else + typeset fill_mb=$(( floor(availspace * 0.40 / 1024 / 1024) )) + fi + + log_must file_write -o create -f /$TRIMPOOL/$TESTFILE \ + -b 1048576 -c $fill_mb -d R + log_must zpool sync + check_vdevs "-gt" "$vdev_max_mb" + + # Remove the file vdev usage should drop to less than 5%. + log_must rm /$TRIMPOOL/$TESTFILE + log_must zpool sync + do_trim $TRIMPOOL + check_vdevs "-le" "$vdev_min_mb" + + log_must zpool destroy $TRIMPOOL + log_must rm -f $VDEVS +done + +log_pass "Manual TRIM successfully shrunk vdevs" diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_003_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_003_pos.ksh new file mode 100755 index 000000000000..49b9e436cc6d --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_003_pos.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify 'zpool import|export' interrupts TRIM. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool. +# 5. Export then import the TRIMing pool. +# 6. Verify the manual TRIM was interrupted. +# 7. Verify the manual TRIM can be resumed and complete successfully. + +verify_runnable "global" + +log_assert "Verify 'zpool import|export' during TRIM resumes" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 16 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +log_must zpool trim -r 1 $TRIMPOOL +log_must zpool export $TRIMPOOL +log_must zpool import -d $VDEV_DIR $TRIMPOOL + +typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') +if [[ "$status" = "interrupted" ]]; then + log_note "Manual TRIM was interrupted" +else + log_fail "Manual TRIM was not interrupted, status is $status" +fi + +# Allow TRIM to be resumed at full rate and verify completion. +do_trim $TRIMPOOL +log_must zpool destroy $TRIMPOOL + +log_pass "Manual TRIM interrupted and resumed after import" diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh new file mode 100755 index 000000000000..7fb0edba2251 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh @@ -0,0 +1,108 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify 'zpool online|offline|replace' while TRIMming. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool. +# 5. Verify 'zpool online|offline|replace' interrupt the TRIM. +# 6. Verify the manual TRIM completes successfully. + +verify_runnable "global" + +log_assert "Verify 'zpool online|offline|replace' while TRIMming" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 1024 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +# Verify 'zpool offline' and 'zpool online'. +for vdev in $VDEVS; do + # Approximately 64M of TRIMable blocks set 1MB/s TRIM rate. + log_must zpool trim -r 1M $TRIMPOOL + + # Offline a vdev manual TRIM must continue. + log_must zpool offline $TRIMPOOL $vdev + typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') + if [[ "$status" != "interrupted" ]]; then + log_note "Manual TRIM is running as expected" + else + log_fail "Manual TRIM was unexpectedly interrupted" + fi + + # Online a vdev resilver stops manual TRIM. + log_must zpool online $TRIMPOOL $vdev + typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') + if [[ "$status" = "interrupted" ]]; then + log_note "Manual TRIM was interrupted as expected by resilver" + else + log_fail "Manual TRIM was not interrupted" + fi + + check_pool $TRIMPOOL +done + +# Verify 'zpool replace' by replacing each drive. +log_must truncate -s $VDEV_SIZE $VDEV_DIR/spare +for vdev in $VDEVS; do + # Approximately 64M of TRIMable blocks set 1MB/s TRIM rate. + log_must zpool trim -r 1M $TRIMPOOL + + log_must zpool replace $TRIMPOOL $vdev $VDEV_DIR/spare + typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') + if [[ "$status" = "interrupted" ]]; then + log_note "Manual TRIM was interrupted as expected by resilver" + else + log_fail "Manual TRIM was not interrupted" + fi + + check_pool $TRIMPOOL + log_must zpool replace $TRIMPOOL $VDEV_DIR/spare $vdev + check_pool $TRIMPOOL +done +log_must rm $VDEV_DIR/spare + +# Allow TRIM to be resumed at full rate and verify completion. +do_trim $TRIMPOOL +log_must zpool destroy $TRIMPOOL + +log_pass "Manual TRIM interrupted by 'zpool online|offline|replace' commands" diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_005_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_005_pos.ksh new file mode 100755 index 000000000000..875ff5d2fff1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_005_pos.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.cfg +. $STF_SUITE/tests/functional/trim/trim.kshlib + +# +# DESCRIPTION: +# Verify TRIM and scrub run concurrently. +# +# STRATEGY: +# 1. Create a pool on the provided VDEVS to TRIM. +# 2. Create a small file and sync the pool. +# 3. Remove the file and sync the pool. +# 4. Manually TRIM the pool. +# 5. Manually scrub the pool. +# 6. Verify TRIM and scrub both are reported by 'zpool status'. + +verify_runnable "global" + +log_assert "Verify TRIM and scrub run concurrently" +log_onexit cleanup_trim + +log_must truncate -s $VDEV_SIZE $VDEVS +log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS + +log_must file_write -o create -f "/$TRIMPOOL/$TESTFILE" -b $BLOCKSIZE -c 1024 -w +sync_pool $TRIMPOOL +log_must rm "/$TRIMPOOL/$TESTFILE" +sync_pool $TRIMPOOL + +log_must zpool trim -r 1M $TRIMPOOL +log_must zpool scrub $TRIMPOOL + +rate=$(zpool status $TRIMPOOL | tr '()' ' ' | awk '/trim:/ {print $11}') +if [[ "$rate" = "1M/s" ]]; then + log_note "Pool TRIMming at expected $rate rate" +else + log_fail "Pool is not TRIMming" +fi + +scrub=$(zpool status $TRIMPOOL | awk '/scan:/ { print $2,$3,$4 }') +if [[ "$scrub" = "scrub in progress" ]] || \ + [[ "$scrub" = "scrub repaired 0B" ]]; then + log_note "Pool scrubbing as expected" +else + log_fail "Pool is not scrubbing: $scrub" +fi + +log_must zpool destroy $TRIMPOOL + +log_pass "TRIM and scrub were able to run concurrently" diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh index feb9ef2ed7ea..5399d7a1b8bb 100755 --- a/tests/zfs-tests/tests/functional/trim/setup.ksh +++ b/tests/zfs-tests/tests/functional/trim/setup.ksh @@ -21,16 +21,20 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -# -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/trim/trim.cfg . $STF_SUITE/tests/functional/trim/trim.kshlib +if [ -n "$HOST_POOL_NAME" ]; then + log_note "Creating TRIM host pool to control recordsize" + log_must zpool create -o cachefile=none -O recordsize=4k \ + -O mountpoint="$VDEV_DIR" "$HOST_POOL_NAME" "$HOST_POOL_DISK" +fi + +log_must rm -f $VDEVS + log_pass TRIM setup succeeded diff --git a/tests/zfs-tests/tests/functional/trim/trim.cfg b/tests/zfs-tests/tests/functional/trim/trim.cfg index ab7e2291d074..a6afb112d4b2 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.cfg +++ b/tests/zfs-tests/tests/functional/trim/trim.cfg @@ -1,3 +1,5 @@ +#!/bin/ksh -p +# # # CDDL HEADER START # @@ -20,41 +22,46 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # -# -# Copyright (c) 2013 by Delphix. All rights reserved. -# +TRIMPOOL="trimpool.$$" +case "$(uname)" in +Linux) + export HOST_POOL_NAME='' + export HOST_POOL_DISK='' + export TRIM_DISKS="$DISKS" -# -# Parameters -# -TRIMPOOL=trimpool -VDEVDIR="/tmp" -VDEVS="/tmp/trim1.dev /tmp/trim2.dev /tmp/trim3.dev /tmp/trim4.dev /tmp/trim5.dev" -VDEV_SIZE=128m -TESTFILE=testfile -SHRUNK_SIZE_MB=20 + export VDEV_DIR="$TEST_BASE_DIR" + export VDEVS="$VDEV_DIR/trim1.dev $VDEV_DIR/trim2.dev \ + $VDEV_DIR/trim3.dev $VDEV_DIR/trim4.dev $VDEV_DIR/trim5.dev" + ;; +SunOS) + # On Illumos, we can't just shove the files into /tmp, because tmpfs + # doesn't support hole punching. UFS doesn't support it either. ZFS + # does, but it won't reduce space usage unless the amount of space + # freed covers at least a full host FS block (128k in most cases), + # which can mess with our space accouting. + # To work around these limitations, we simply use the first disk in + # $DISKS to hold a host pool with recordsize=4k, so we can guarantee + # file hole punching of a usable granularity for our needs. + export HOST_POOL_NAME="trimhost" + export HOST_POOL_DISK=$(echo "$DISKS" | awk '{print $1}') + export TRIM_DISKS="$(echo "$DISKS" | tr ' ' '\n' | grep -v '^$' | \ + tail +2 | tr '\n' ' ')" -NUM_WRITES=2048 -BLOCKSIZE=65536 + export VDEV_DIR="/$HOST_POOL_NAME" + export VDEVS="$VDEV_DIR/trim1.dev $VDEV_DIR/trim2.dev \ + $VDEV_DIR/trim3.dev $VDEV_DIR/trim4.dev $VDEV_DIR/trim5.dev" + ;; +esac -# -# Computed values and parameters -# -function get_mirror_vdevs -{ - set -- $VDEVS - MIRROR_VDEVS_1="$1 $2" - MIRROR_VDEVS_2="$3 $4" -} -get_mirror_vdevs - -function get_stripe_vdevs -{ - set -- $VDEVS - STRIPE_VDEVS="$1 $2 $3 $4" -} -get_stripe_vdevs +# These test limits are algorithm-sensitive, so whenever you adjust the +# way TRIM processes extents and filters them, be sure to adjust these +# accordingly to get all tests to pass. +export VDEV_SIZE=$MINVDEVSIZE +export TESTFILE=testfile +export MIN_TRIM_IOS=100 +export NUM_WRITES=2048 +export BLOCKSIZE=65536 diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib index d1b35f0aa46d..ef45c737caf2 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.kshlib +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -1,3 +1,4 @@ +#!/bin/ksh -p # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. @@ -9,28 +10,228 @@ # http://www.illumos.org/license/CDDL. # -function set_tunable +# +# Copyright (c) 2017 by Tim Chase. All rights reserved. +# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +# +# Get the actual on disk disk for the provided file. +# +function get_size_mb { - typeset tunable="$1" - typeset value="$2" - typeset zfs_tunables="/sys/module/zfs/parameters" + case "$(uname)" in + Linux) + typeset rval + rval=$(du --block-size 1048576 -s "$1" | awk '{print $1}') + echo -n "$rval" + ;; + SunOS) + du -m "$1" | awk '{print $1}' + ;; + esac +} - [[ -z "$tunable" ]] && return 1 - [[ -z "$value" ]] && return 1 - [[ -f "$zfs_tunables/$tunable" ]] || return 1 +# +# Get the number of auto|manual TRIM IOs issued for the pool. +# +function get_trim_io +{ + typeset pool="${1-:$TRIMPOOL}" + typeset type="${2-:auto}" + + case "$(uname)" in + Linux) + typeset rval - echo -n "$value" > "$zfs_tunables/$tunable" - return "$?" + # Sum the auto|man columns of the TRIM request size histogram. + case "$type" in + auto) + rval=$(zpool iostat -pr $pool | awk \ + '$1 ~ /[0-9].*/ { sum += $12 } END { print sum }') + echo -n "$rval" + ;; + man) + rval=$(zpool iostat -pr $pool | awk \ + '$1 ~ /[0-9].*/ { sum += $13 } END { print sum }') + echo -n "$rval" + ;; + esac + ;; + SunOS) + # 'zpool iostat -r' is not supported, this information may + # be available via another interface on Illumos. For the + # moment return $MIN_TRIM_IOS and assume TRIM IOs were issued. + echo -n "$MIN_TRIM_IOS" + ;; + esac } -function find_scsi_debug +# +# Generic cleanup function for TRIM test cases. +# +function cleanup_trim { - grep -H scsi_debug /sys/block/*/device/model | \ - awk -F/ '{print $4}' | tr '\n' ' ' + pkill -x file_write + if poolexists $TRIMPOOL; then + log_must destroy_pool $TRIMPOOL + fi + log_must rm -f $VDEVS + set_tunable64 zfs_trim_min_ext_sz 32 + set_tunable32 zfs_txgs_per_trim 32 } -function setupvdevs +# +# Check that TRIM IOs were send to devices in the pool. +# +function check_trim_io { - log_must rm -f $VDEVS - log_must truncate -s 192m $VDEVS + typeset pool="${1-:$TRIMPOOL}" + typeset type="$2" + typeset ios + + ios=$(get_trim_io $pool $type) + if [[ $ios -ge $MIN_TRIM_IOS ]]; then + log_note "Issued $ios $type TRIM IOs for pool $pool" + else + log_fail "Too few TRIM IOs issued $ios/$MIN_TRIM_IOS" + fi +} + +# +# Run N txgs which should be enough to TRIM the entire pool. +# +function wait_trim_io +{ + typeset pool="${1-:$TRIMPOOL}" + typeset type="${2-:auto}" + typeset txgs=${3:-10} + typeset timeout=30 + typeset stop_time=$(( $(date +%s) + $timeout )) + + typeset -i i=0 + while [[ $i -lt $txgs ]]; do + typeset ios=$(get_trim_io $pool $type) + if [ "$(date +%s)" -ge $stop_time ]; then + log_fail "Exceeded TRIM time limit of ${timeout}s" + return + fi + + log_note "Waiting for $type TRIM to complete ($i - $ios IOs)" + zpool sync -f + ((i = i + 1)) + done +} + +# +# Check that file vdevs against a taget value. +# +function check_vdevs +{ + typeset tgt_op=$1 + typeset tgt_size=$2 + typeset vdev + + for vdev in $VDEVS; do + typeset size=$(get_size_mb $vdev) + if test $size $tgt_op $tgt_size; then + log_note "Success $vdev is $size MB which is $tgt_op" \ + "than $tgt_size MB" + else + log_fail "Failure $vdev is $size MB which is not" \ + "$tgt_op than $tgt_size MB" + fi + done +} + +# +# Scrub the pool and verify it completed without errors. +# +function check_pool # pool +{ + typeset pool="${1-:$TRIMPOOL}" + + log_must zpool scrub $pool + while true; do + typeset st=$(zpool status $pool | awk '/scan:/ {print $3}') + if [[ "$st" == "repaired" ]] || [[ "$st" == "canceled" ]]; then + break + fi + log_note "Waiting for scrub to complete on $pool" + sleep 1 + done + + log_must zpool status -x $pool + log_must zpool clear $pool +} + +# +# Concurrently write files in randomly sized chunks fsync'ing every write +# then remove a fraction of them. This is intended to create TRIMable blocks. +# +function write_remove # destroy_files keep_files +{ + typeset destroy_files=${1:-3} + typeset keep_files=${2:-3} + + for i in $(seq $destroy_files); do + log_must eval "(file_write -o create \ + -f \"/$TRIMPOOL/$TESTFILE-destroy.$i\" \ + -b $(random $BLOCKSIZE) -c $(random $NUM_WRITES) -d R -w; \ + rm \"/$TRIMPOOL/$TESTFILE-destroy.$i\") &" + done + + for i in $(seq $keep_files); do + log_must eval "file_write -o create \ + -f \"/$TRIMPOOL/${TESTFILE}-keep.$i\" \ + -b $(random $BLOCKSIZE) -c $(random $NUM_WRITES) -d R -w &" + done + + wait +} + +# +# Perform administrative commands which will create TRIMable blocks. +# +function snap_clone # passes +{ + typeset passes=${1:-3} + + for i in $(seq $passes); do + log_must zfs snapshot $TRIMPOOL@snap + log_must zfs clone $TRIMPOOL@snap $TRIMPOOL/clone + log_must zfs destroy $TRIMPOOL/clone + log_must zfs destroy $TRIMPOOL@snap + done +} + +# +# Run manual trim for at most 30 seconds and verify the result. +# +function do_trim # pool options +{ + typeset pool="${1-:$TRIMPOOL}" + typeset options=$2 + typeset stop_time=$(( $(date +%s) + 30 )) + + log_must zpool trim $options $pool + + while true; do + typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') + if [ -z "$status" ]; then + log_fail "Pool reported '' TRIM status. Is TRIM" \ + "supported on this system?" + elif [[ "$status" = "completed" ]]; then + log_note "Pool completed TRIM successfully." + break + elif [[ "$status" = "interrupted" ]]; then + log_fail "TRIM interrupted it was expected to complete." + elif [ "$(date +%s)" -ge $stop_time ]; then + log_must zpool trim -s $pool + log_fail "Exceeded trim time limit of 30s, stopping." + else + sleep 1 + fi + done } From 46d31dec86d3d563f7eaebe88dd848e1647681fc Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 24 May 2017 19:01:23 -0400 Subject: [PATCH 18/38] Review feedback * Rename TRIM taskq threads to be more concise for Linux. * Fix divide by zero panic Signed-off-by: Brian Behlendorf Requires-builders: none --- module/zfs/spa_misc.c | 4 ++-- module/zfs/vdev.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index aa6192dd54fa..381cc1653f99 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2599,7 +2599,7 @@ spa_auto_trim_taskq_create(spa_t *spa) ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock)); ASSERT(spa->spa_auto_trim_taskq == NULL); - (void) snprintf(name, MAXPATHLEN, "%s_auto_trim", spa->spa_name); + (void) snprintf(name, MAXPATHLEN, "z_atrim_%s", spa->spa_name); spa->spa_auto_trim_taskq = taskq_create(name, zfs_auto_trim_taskq_batch_pct, minclsyspri, 1, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -2626,7 +2626,7 @@ spa_man_trim_taskq_create(spa_t *spa) */ return; } - (void) snprintf(name, MAXPATHLEN, "%s_man_trim", spa->spa_name); + (void) snprintf(name, MAXPATHLEN, "z_mtrim_%s", spa->spa_name); spa->spa_man_trim_taskq = taskq_create(name, spa->spa_root_vdev->vdev_children, minclsyspri, spa->spa_root_vdev->vdev_children, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8d879b4be958..ebf4010077e0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4901,7 +4901,7 @@ vdev_auto_trim(vdev_trim_info_t *vti) vdev_t *vd = vti->vti_vdev; spa_t *spa = vd->vdev_spa; uint64_t txg = vti->vti_txg; - uint64_t txgs_per_trim = zfs_txgs_per_trim; + uint64_t txgs_per_trim = MAX(zfs_txgs_per_trim, 1); uint64_t mlim = 0, mused = 0; uint64_t ms_count = vd->vdev_ms_count; boolean_t preserve_spilled; From d56cfce379071dc37df329e626b4e9720c3f59d5 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Wed, 24 May 2017 19:02:02 -0400 Subject: [PATCH 19/38] Remove vdev_raidz_map_alloc() Rather than hacking `vdev_raidz_map_alloc()` to get the child offsets calculate the values directly. Signed-off-by: Isaac Huang Requires-builders: none --- module/zfs/vdev_raidz.c | 134 ++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 73 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 11602a506a0d..a86d2b9da838 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -141,10 +141,6 @@ vdev_raidz_map_free(raidz_map_t *rm) { int c; - /* raidz_map_t without abd allocation from vdev_raidz_trim() */ - if (rm->rm_col[0].rc_abd == NULL) - goto out; - for (c = 0; c < rm->rm_firstdatacol; c++) { abd_free(rm->rm_col[c].rc_abd); @@ -158,7 +154,6 @@ vdev_raidz_map_free(raidz_map_t *rm) if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); -out: kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -447,9 +442,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = - abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, + off, rm->rm_col[c].rc_size); off += rm->rm_col[c].rc_size; } } @@ -1631,38 +1625,6 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -/* - * Converts an allocated size on a raidz vdev back to a logical block - * size. This is used in trimming to figure out the appropriate logical - * size to pass to vdev_raidz_map_alloc when splitting up extents of free - * space obtained from metaslabs. However, a range of free space on a - * raidz vdev might have originally consisted of multiple blocks and - * those, taken together with their skip blocks, might not always align - * neatly to a new vdev_raidz_map_alloc covering the entire unified - * range. So to ensure that the newly allocated raidz map *always* fits - * within the asize passed to this function and never exceeds it (since - * that might trim allocated data past it), we round it down to the - * nearest suitable multiple of the vdev ashift (hence the "_floor" in - * this function's name). - */ -static uint64_t -vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize) -{ - uint64_t psize; - uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - - psize = (asize - (nparity << ashift)); - psize /= cols; - psize *= cols - nparity; - psize += (1 << ashift) - 1; - - psize = P2ALIGN(psize, 1 << ashift); - - return (psize); -} - static void vdev_raidz_child_done(zio_t *zio) { @@ -2443,19 +2405,20 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) } static inline void -vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp, - const raidz_col_t *rc) +vdev_raidz_trim_append(dkioc_free_list_t *dfl, uint64_t *num_extsp, + uint64_t offset, uint64_t size) { uint64_t num_exts = *num_extsp; - ASSERT(rc->rc_size != 0); + + ASSERT(size != 0); if (dfl->dfl_num_exts > 0 && dfl->dfl_exts[num_exts - 1].dfle_start + - dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) { - dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size; + dfl->dfl_exts[num_exts - 1].dfle_length == offset) { + dfl->dfl_exts[num_exts - 1].dfle_length += size; } else { - dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset; - dfl->dfl_exts[num_exts].dfle_length = rc->rc_size; + dfl->dfl_exts[num_exts].dfle_start = offset; + dfl->dfl_exts[num_exts].dfle_length = size; (*num_extsp)++; } } @@ -2471,16 +2434,14 @@ static void vdev_raidz_trim(vdev_t *vd, zio_t *pio, dkioc_free_list_t *dfl, boolean_t auto_trim) { + const uint64_t children = vd->vdev_children; dkioc_free_list_t **sub_dfls; uint64_t *sub_dfls_num_exts; - zio_t *zio; - - sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children, - KM_SLEEP); - sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children, - KM_SLEEP); - zio = kmem_zalloc(sizeof (*zio), KM_SLEEP); - for (int i = 0; i < vd->vdev_children; i++) { + + sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * children, KM_SLEEP); + sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int i = 0; i < children; i++) { /* * We might over-allocate here, because the sub-lists can never * be longer than the parent list, but they can be shorter. @@ -2494,34 +2455,62 @@ vdev_raidz_trim(vdev_t *vd, zio_t *pio, dkioc_free_list_t *dfl, } /* - * Process all extents and redistribute them to the component vdevs - * according to a computed raidz map geometry. + * Process all extents and redistribute them to the component vdevs. + * + * 1. Calculate the number of child drives, i.e. cols, which may be + * smaller than vdev_children + * 2. For each child drive, calculate offset and size: + * a. 'offset' needs to be increased by 1 sector, when the drive + * wraps around to the next row, because the 1st drive does + * not necessarily begin at the 1st raidz child drive. + * b. 'size' needs to be increased by 1 sector, for the first + * remainder drives, because the extent doesn't always divide + * cleanly by cols, i.e. some drives may contribute more space + * to the extent. */ for (int i = 0; i < dfl->dfl_num_exts; i++) { uint64_t start = dfl->dfl_exts[i].dfle_start; uint64_t length = dfl->dfl_exts[i].dfle_length; - uint64_t j; - raidz_map_t *rm; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t b = start >> ashift; + uint64_t s = length >> ashift; + /* The first column for this stripe. */ + uint64_t f = b % children; + uint64_t cols = (s < children) ? s : children; + uint64_t remainder = s % cols; + + ASSERT0(P2PHASE(start, 1ULL << ashift)); + ASSERT0(P2PHASE(length, 1ULL << ashift)); + + if (length <= vd->vdev_nparity << vd->vdev_top->vdev_ashift) + continue; - zio->io_offset = start; - zio->io_size = vdev_raidz_psize_floor(vd, length); - zio->io_abd = NULL; + for (int j = 0; j < cols; j++) { + uint64_t devidx = f + j; + uint64_t offset = b / children; + uint64_t size = s / cols; - rm = vdev_raidz_map_alloc(zio, vd->vdev_top->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); + if (j < remainder) + size++; - for (j = 0; j < rm->rm_cols; j++) { - uint64_t devidx = rm->rm_col[j].rc_devidx; - vdev_raidz_trim_append_rc(sub_dfls[devidx], - &sub_dfls_num_exts[devidx], &rm->rm_col[j]); + if (devidx >= children) { + offset++; + devidx -= children; + } + + size <<= ashift; + offset <<= ashift; + vdev_raidz_trim_append(sub_dfls[devidx], + &sub_dfls_num_exts[devidx], offset, size); + length -= size; } - vdev_raidz_map_free(rm); + ASSERT0(length); } /* * Issue the component ioctls as children of the parent zio. */ - for (int i = 0; i < vd->vdev_children; i++) { + for (int i = 0; i < children; i++) { if (sub_dfls_num_exts[i] != 0) { vdev_t *child = vd->vdev_child[i]; zio_nowait(zio_trim_dfl(pio, child->vdev_spa, child, @@ -2530,9 +2519,8 @@ vdev_raidz_trim(vdev_t *vd, zio_t *pio, dkioc_free_list_t *dfl, dfl_free(sub_dfls[i]); } } - kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children); - kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children); - kmem_free(zio, sizeof (*zio)); + kmem_free(sub_dfls, sizeof (*sub_dfls) * children); + kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * children); } vdev_ops_t vdev_raidz_ops = { From ae1457d7dd76c2789ac1914c614fd3ce67e875a4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 May 2017 14:16:35 -0400 Subject: [PATCH 20/38] Review feedback 2 * Fixed missing taskq_destroy when exporting a pool which is being actively trimmed. * Add auto/manual TRIM coverage to ztest. * Temporarily disable manualtrim_004_pos. Signed-off-by: Brian Behlendorf Requires-builders: none --- cmd/ztest/ztest.c | 19 +++++++++++++++++++ module/zfs/spa.c | 9 +++++++++ .../functional/trim/manualtrim_004_pos.ksh | 3 +++ 3 files changed, 31 insertions(+) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index e080f7bb4419..c1855ab8221f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -377,6 +377,7 @@ ztest_func_t ztest_initialize; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; +ztest_func_t ztest_man_trim; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -430,6 +431,7 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), + ZTI_INIT(ztest_man_trim, 1, &zopt_sometimes), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -5532,6 +5534,21 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) } } +/* + * Start then stop a manual TRIM. + */ +void +ztest_man_trim(ztest_ds_t *zd, uint64_t id) +{ + uint64_t rate = 1 << ztest_random(30); + boolean_t fulltrim = (ztest_random(5) > 0); + spa_t *spa = ztest_spa; + + spa_man_trim(spa, rate, fulltrim); + (void) poll(NULL, 0, 100); /* wait a moment, then stop the TRIM. */ + spa_man_trim_stop(spa); +} + /* ARGSUSED */ void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) @@ -5567,6 +5584,8 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); + (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); + VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cdd3cb95fd26..7af8ce355958 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1478,6 +1478,15 @@ spa_unload(spa_t *spa) spa_auto_trim_taskq_destroy(spa); mutex_exit(&spa->spa_auto_trim_lock); + /* + * Destroy manual trim taskq if needed, this may be required if the + * async task was unable to run prior to being suspended. + */ + mutex_enter(&spa->spa_man_trim_lock); + if (spa->spa_man_trim_taskq) + spa_man_trim_taskq_destroy(spa); + mutex_exit(&spa->spa_man_trim_lock); + /* * Even though vdev_free() also calls vdev_metaslab_fini, we need * to call it earlier, before we wait for async i/o to complete. diff --git a/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh b/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh index 7fb0edba2251..02a55acdb337 100755 --- a/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/trim/manualtrim_004_pos.ksh @@ -47,6 +47,9 @@ verify_runnable "global" log_assert "Verify 'zpool online|offline|replace' while TRIMming" log_onexit cleanup_trim +# XXX - Disabled for automated testing only +log_unsupported "Skipping until issue is resolved" + log_must truncate -s $VDEV_SIZE $VDEVS log_must zpool create -o cachefile=none -f $TRIMPOOL raidz $VDEVS From 8b88b3e352905efde0d2fb5a45d60cf1aab34419 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 28 Mar 2018 12:07:49 -0700 Subject: [PATCH 21/38] Add trim manpage Signed-off-by: Chunwei Chen Requires-builders: none --- man/man5/zfs-module-parameters.5 | 55 +++++++++++++ man/man8/zpool.8 | 127 +++++++++++++++++++++++++++++-- 2 files changed, 175 insertions(+), 7 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index c9dfceb7eca6..b3dc5cc2f33b 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2306,6 +2306,49 @@ value of 75% will create a maximum of one thread per cpu. Default value: \fB75\fR%. .RE +.sp +.ne 2 +.na +\fBzfs_trim\fR (int) +.ad +.RS 12n +Controls whether the underlying vdevs of the pool are notified when +space is freed using the device-type-specific command set (TRIM here +being a general placeholder term rather than referring to just the SATA +TRIM command). This is frequently used on backing storage devices which +support thin provisioning or pre-erasure of blocks on flash media. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_trim_min_ext_sz\fR (int) +.ad +.RS 12n +Minimum size region in bytes over which a device-specific TRIM command +will be sent to the underlying vdevs when \fBzfs_trim\fR is set. +.sp +Default value: \fB131072\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_trim_sync\fR (int) +.ad +.RS 12n +Controls whether the underlying vdevs should issue TRIM commands synchronously +or asynchronously. When set for synchronous operation, extents to TRIM are +processed sequentially with each extent waiting for the last to complete. +In asynchronous mode TRIM commands for all provided extents are submitted +concurrently to the underlying vdev. The optimal strategy depends on how +the physical device handles TRIM commands. +.sp +Default value: \fB1\fR. +.RE + .sp .ne 2 .na @@ -2329,6 +2372,18 @@ Flush dirty data to disk at least every N seconds (maximum txg duration) Default value: \fB5\fR. .RE +.sp +.ne 2 +.na +\fBzfs_txgs_per_trim\fR (int) +.ad +.RS 12n +Number of transaction groups over which device-specific TRIM commands +are batched when \fBzfs_trim\fR is set. +.sp +Default value: \fB32\fR. +.RE + .sp .ne 2 .na diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 8f35ca0ee7ce..841fdf1d0d33 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -174,6 +174,11 @@ .Op Fl s | Fl p .Ar pool Ns ... .Nm +.Cm trim +.Op Fl p +.Op Fl r Ar rate | Fl s +.Ar pool Ns ... +.Nm .Cm set .Ar property Ns = Ns Ar value .Ar pool @@ -801,6 +806,41 @@ Any write requests that have yet to be committed to disk would be blocked. .It Sy panic Prints out a message to the console and generates a system crash dump. .El +.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off +When set to +.Sy on Ns , while deleting data, ZFS will inform the underlying vdevs of any +blocks that have been marked as freed. This allows thinly provisioned vdevs to +reclaim unused blocks. This feature is supported on file vdevs via hole +punching if it is supported by their underlying file system and on block +device vdevs if their underlying driver supports BLKDISCARD. The default +setting for this property is +.Sy off . +.Pp +Please note that automatic trimming of data blocks can put significant stress +on the underlying storage devices if they do not handle these commands in a +background, low-priority manner. In that case, it may be possible to achieve +most of the benefits of trimming free space on the pool by running an +on-demand (manual) trim every once in a while during a maintenance window +using the +.Nm zpool Cm trim +command. +.Pp +Automatic trim does not reclaim blocks after a delete immediately. Instead, +it waits approximately 2-4 minutes to allow for more efficient aggregation of +smaller portions of free space into fewer larger regions, as well as to allow +for longer pool corruption recovery via +.Nm zpool Cm import Fl F . +.It Sy forcetrim Ns = Ns Sy on Ns | Ns Sy off +Controls whether device support is taken into consideration when issuing +TRIM commands to the underlying vdevs of the pool. Normally, both automatic +trim and on-demand (manual) trim only issue TRIM commands if a vdev indicates +support for it. Setting the +.Sy forcetrim +property to +.Sy on +will force ZFS to issue TRIMs even if it thinks a device does not support it. +The default is +.Sy off . .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled The value of this property is the current state of .Ar feature_name . @@ -1744,15 +1784,20 @@ the path. This can be used in conjunction with the .Fl L flag. .It Fl r -Print request size histograms for the leaf ZIOs. This includes -histograms of individual ZIOs ( +Print request size histograms for the leaf vdev's IO. This includes +histograms of individual IOs ( .Ar ind ) -and aggregate ZIOs ( +and aggregate IOs ( .Ar agg ). -These stats can be useful for seeing how well the ZFS IO aggregator is -working. Do not confuse these request size stats with the block layer -requests; it's possible ZIOs can be broken up before being sent to the -block device. +TRIM IOs will not be aggregated and are split in to automatic ( +.Ar auto ) +and manual ( +.Ar man ). +TRIM requests which exceed 16M in size are counted as 16M requests. These +stats can be useful for seeing how well the ZFS IO aggregator is working. Do +not confuse these request size stats with the block layer requests; it's +possible these IOs will be broken up or merged before being sent to the block +device. .It Fl v Verbose statistics Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. @@ -1790,6 +1835,8 @@ Average amount of time IO spent in asynchronous priority queues. Does not include disk time. .Ar scrub : Average queuing time in scrub queue. Does not include disk time. +.Ar trim : +Average queuing time in trim queue. Does not include disk time. .It Fl q Include active queue statistics. Each priority queue has both pending ( @@ -1807,6 +1854,8 @@ queues. Current number of entries in asynchronous priority queues. .Ar scrubq_read : Current number of entries in scrub queue. +.Ar auto/man_trimq : +Current number of entries in automatic or manual trim queues. .Pp All queue statistics are instantaneous measurements of the number of entries in the queues. If you specify an interval, the measurements @@ -2110,6 +2159,70 @@ again. Starts a resilver. If an existing resilver is already running it will be restarted from the beginning. Any drives that were scheduled for a deferred resilver will be added to the new one. +.Cm trim +.Op Fl p +.Op Fl r Ar rate | Fl s +.Ar pool Ns ... +.Xc +Initiates an immediate on-demand TRIM operation on all of the free space of a +pool without delaying 2-4 minutes as it done for automatic trim. This informs +the underlying storage devices of all of the blocks that the pool no longer +considers allocated, thus allowing thinly provisioned storage devices to +reclaim them. +.Pp +Also note that an on-demand TRIM operation can be initiated irrespective of +the +.Sy autotrim +zpool property setting. It does, however, respect the +.Sy forcetrim +zpool property. +.Pp +An on-demand TRIM operation does not conflict with an ongoing scrub, but it +can put significant I/O stress on the underlying vdevs. A resilver, however, +automatically stops an on-demand TRIM operation. You can manually reinitiate +the TRIM operation after the resilver has started, by simply reissuing the +.Nm zpool Cm trim +command. +.Pp +Adding a vdev during TRIM is supported, although the progression display in +.Nm zpool Cm status +might not be entirely accurate in that case (TRIM will complete before +reaching 100%). Removing or detaching a vdev will prematurely terminate an +on-demand TRIM operation. +.Pp +See the documentation for the +.Sy autotrim +property above for a description of the vdevs on which +.Nm zpool Cm trim +is supported. +.Bl -tag -width Ds +.It Fl p +Causes a "partial" trim to be initiated in which space which has never been +allocated by ZFS is not trimmed. This option is useful for certain storage +backends such as large thinly-provisioned SANS on which large trim operations +are slow. +.El +.Bl -tag -width Ds +.It Fl r Ar rate +Controls the speed at which the TRIM operation progresses. Without this +option, TRIM is executed as quickly as possible. The rate, expressed in bytes +per second, is applied on a per-vdev basis; every top-level vdev in the pool +tries to match this speed. The requested rate is achieved by inserting delays +between each TRIMmed region. +.Pp +When an on-demand TRIM operation is already in progress, this option changes +its rate. To change a rate-limited TRIM to an unlimited one, simply execute +the +.Nm zpool Cm trim +command without a +.Fl r +option. +.El +.Bl -tag -width Ds +.It Fl s +Stop trimming. If an on-demand TRIM operation is not ongoing at the moment, +this does nothing and the command returns success. +.El .It Xo .Nm .Cm set From 977c20e047a0fa219579ecc84620a3ff4a74f5cd Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Apr 2018 11:54:40 -0700 Subject: [PATCH 22/38] Fix wrong logical operator Signed-off-by: Chunwei Chen Requires-builders: none --- module/zfs/vdev_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index f02899ec4755..cd35ada017c8 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -780,7 +780,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) vdev_queue_pending_add(vq, zio); /* trim I/Os have no single meaningful offset */ - if (zio->io_priority != ZIO_PRIORITY_AUTO_TRIM || + if (zio->io_priority != ZIO_PRIORITY_AUTO_TRIM && zio->io_priority != ZIO_PRIORITY_MAN_TRIM) vq->vq_last_offset = zio->io_offset + zio->io_size; From 3670596e8b1b50a6facb9d334836a40ba0d16df7 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Apr 2018 11:55:06 -0700 Subject: [PATCH 23/38] Wait for 1 sec before check trim status Signed-off-by: Chunwei Chen Requires-builders: none --- tests/zfs-tests/tests/functional/trim/trim.kshlib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib index ef45c737caf2..bb5f91c31047 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.kshlib +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -216,7 +216,7 @@ function do_trim # pool options typeset stop_time=$(( $(date +%s) + 30 )) log_must zpool trim $options $pool - + sleep 1 while true; do typeset status=$(zpool status $pool | awk '/trim:/ {print $2}') if [ -z "$status" ]; then From f19d6f696293e64e0b9d1b51069a9684b1174374 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Mon, 9 Jul 2018 08:18:30 -0500 Subject: [PATCH 24/38] Clean-ups following rebase to master Signed-off-by: Tim Chase Requires-builders: none --- module/zfs/metaslab.c | 30 ++++++++++++++---------------- module/zfs/range_tree.c | 1 - module/zfs/spa.c | 4 ++-- module/zfs/vdev_indirect.c | 21 ++++++++------------- 4 files changed, 24 insertions(+), 32 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cbcdc4a59695..8fcb87d3a217 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1283,7 +1283,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); @@ -1436,7 +1436,7 @@ metaslab_load_impl(metaslab_t *msp) */ if (msp->ms_trimming_ts != NULL) { range_tree_walk(msp->ms_trimming_ts, range_tree_remove, - msp->ms_tree); + msp->ms_allocatable); } msp->ms_max_size = metaslab_block_maxsize(msp); } @@ -1528,7 +1528,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ASSERT(ms->ms_sm != NULL); } - ms->ms_cur_ts = range_tree_create(NULL, NULL, &ms->ms_lock); + ms->ms_cur_ts = range_tree_create(NULL, NULL); /* * We create the main range tree here, but we don't create the @@ -2367,7 +2367,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_tree, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); if (msp->ms_trimming_ts != NULL) space_map_write(sm, msp->ms_trimming_ts, SM_FREE, SM_NO_VDEVID, tx); @@ -2683,8 +2683,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) !vd->vdev_man_trimming) { range_tree_walk(*defer_tree, metaslab_trim_add, msp); if (!defer_allowed) { - range_tree_walk(msp->ms_freedtree, metaslab_trim_add, - msp); + range_tree_walk(msp->ms_freed, metaslab_trim_add, msp); } } range_tree_vacate(*defer_tree, @@ -4398,9 +4397,9 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, rsearch.rs_start = cur; rsearch.rs_end = cur + SPA_MINBLOCKSIZE; - rs = avl_find(&msp->ms_tree->rt_root, &rsearch, &where); + rs = avl_find(&msp->ms_allocatable->rt_root, &rsearch, &where); if (rs == NULL) { - rs = avl_nearest(&msp->ms_tree->rt_root, where, AVL_AFTER); + rs = avl_nearest(&msp->ms_allocatable->rt_root, where, AVL_AFTER); if (rs != NULL) cur = rs->rs_start; } @@ -4414,13 +4413,13 @@ metaslab_trim_all(metaslab_t *msp, uint64_t *cursor, uint64_t *delta, trimmed_space += (end - cur); cur = end; if (cur == rs->rs_end) - rs = AVL_NEXT(&msp->ms_tree->rt_root, rs); + rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs); } if (trimmed_space != 0) { /* Force this trim to take place ASAP. */ msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); + msp->ms_cur_ts = range_tree_create(NULL, NULL); trim_io = metaslab_exec_trim(msp, B_FALSE); ASSERT(trim_io != NULL); @@ -4548,7 +4547,7 @@ metaslab_auto_trim(metaslab_t *msp, boolean_t preserve_spilled) } } msp->ms_prev_ts = msp->ms_cur_ts; - msp->ms_cur_ts = range_tree_create(NULL, NULL, &msp->ms_lock); + msp->ms_cur_ts = range_tree_create(NULL, NULL); mutex_exit(&msp->ms_lock); } @@ -4609,7 +4608,7 @@ metaslab_trim_done(zio_t *zio) VERIFY(!msp->ms_condensing); if (msp->ms_loaded) { range_tree_walk(msp->ms_trimming_ts, range_tree_add, - msp->ms_tree); + msp->ms_allocatable); } metaslab_free_trimset(msp->ms_trimming_ts); msp->ms_trimming_ts = NULL; @@ -4684,7 +4683,7 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) for (range_seg_t *rs = avl_first(&trim_tree->rt_root); rs != NULL; rs = AVL_NEXT(&trim_tree->rt_root, rs)) { #ifdef DEBUG - if (!range_tree_contains_part(msp->ms_tree, + if (!range_tree_contains_part(msp->ms_allocatable, rs->rs_start, rs->rs_end - rs->rs_start)) { panic("trimming allocated region; rs=%p", (void*)rs); @@ -4696,7 +4695,7 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) * the tree of free space. They'll then be added back * in in metaslab_trim_done. */ - range_tree_remove(msp->ms_tree, rs->rs_start, + range_tree_remove(msp->ms_allocatable, rs->rs_start, rs->rs_end - rs->rs_start); } } @@ -4724,8 +4723,7 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) if (auto_trim) { uint64_t start = 0; range_seg_t *rs; - range_tree_t *sub_trim_tree = range_tree_create(NULL, NULL, - &msp->ms_lock); + range_tree_t *sub_trim_tree = range_tree_create(NULL, NULL); zio = zio_null(NULL, spa, vd, metaslab_trim_done, msp, 0); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 29827b750f92..60619b00dec8 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -516,7 +516,6 @@ range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { range_seg_t *rs; - ASSERT(MUTEX_HELD(rt->rt_lock)); rs = range_tree_find(rt, off, size); if (rs != NULL) panic("freeing free block; rs=%p", (void *)rs); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7af8ce355958..1164a381f0ee 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3549,9 +3549,9 @@ spa_ld_get_props(spa_t *spa) } (void) spa_dir_prop(spa, DMU_POOL_TRIM_START_TIME, - &spa->spa_man_trim_start_time); + &spa->spa_man_trim_start_time, B_FALSE); (void) spa_dir_prop(spa, DMU_POOL_TRIM_STOP_TIME, - &spa->spa_man_trim_stop_time); + &spa->spa_man_trim_stop_time, B_FALSE); /* * If we are importing a pool with missing top-level vdevs, diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 2f8268f0fab6..2e65c462110c 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1844,19 +1844,14 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ + .vdev_op_open = vdev_indirect_open, + .vdev_op_close = vdev_indirect_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_indirect_io_start, + .vdev_op_io_done = vdev_indirect_io_done, + .vdev_op_remap = vdev_indirect_remap, + .vdev_op_type = VDEV_TYPE_INDIRECT, + .vdev_op_leaf = B_FALSE }; #if defined(_KERNEL) From ec4e8943eb7eb4e16fdf404483d42dca4b7c2f95 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Wed, 26 Dec 2018 14:48:07 -0600 Subject: [PATCH 25/38] ZIO_PIPELINE_CONTINUE fix Requires-builders: none --- module/zfs/zio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0b283f62b86b..30822a896e30 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3927,7 +3927,7 @@ zio_vdev_io_start(zio_t *zio) } if (ZIO_IS_TRIM(zio) && zio_trim_should_bypass(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); vd->vdev_ops->vdev_op_io_start(zio); return (NULL); From 0b8cc399626c0eaa7e1c20b70d70180e9d78436f Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Wed, 26 Dec 2018 16:43:02 -0600 Subject: [PATCH 26/38] More fixups Requires-builders: none --- module/zfs/vdev_label.c | 8 -------- module/zfs/vdev_queue.c | 2 ++ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 64d2e37bdc46..be921ea7d888 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -376,14 +376,6 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); - fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_AUTO_TRIM_HISTO, - vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM], - ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM])); - - fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_MAN_TRIM_HISTO, - vsx->vsx_agg_histo[ZIO_PRIORITY_AUTO_TRIM], - ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_MAN_TRIM])); - /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index cd35ada017c8..f768d3996d4a 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -451,6 +451,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; spa_history_kstat_t *shk = &spa->spa_stats.io_history; + avl_tree_t *qtt; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); @@ -470,6 +471,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; spa_history_kstat_t *shk = &spa->spa_stats.io_history; + avl_tree_t *qtt; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); From 26fdfdb31d48b88e034ef1dbc6c372ce0c89091d Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 30 Dec 2018 09:24:15 -0600 Subject: [PATCH 27/38] Add tags to trim test cases NOTE: should be squashed into a previous commit Requires-builders: none --- tests/runfiles/linux.run | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 45148a111341..f9e90f6cefcb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -472,6 +472,7 @@ tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_001_pos', 'zpool_trim_002_pos', 'zpool_trim_003_pos', 'zpool_trim_004_pos', 'zpool_trim_005_neg', 'zpool_trim_006_neg'] +tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', From 3c5eb9c9037238bfa2923124b22182a3035667f8 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Tue, 1 Jan 2019 20:10:31 -0600 Subject: [PATCH 28/38] Preserve activation flags when sorting metaslabs Also, comment-out the ASSERT(!metaslab_should_allocate(msp, asize)); in metaslab_group_alloc_normal(). It seems that the additional metaslab_group_sort() performed by trim makes this assertion invalid. Requires-builders: none --- module/zfs/metaslab.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 8fcb87d3a217..5e578e5128ca 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -3301,7 +3301,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + /* ASSERT(!metaslab_should_allocate(msp, asize)); XXX */ mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -4717,7 +4717,8 @@ metaslab_exec_trim(metaslab_t *msp, boolean_t auto_trim) * don't know which buckets to alter with what we have in * trim_tree. */ - metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp)); + metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp) | + (msp->ms_weight & METASLAB_ACTIVE_MASK)); } if (auto_trim) { From 014e259172f6d3613d5292edfdbbdf76ab805955 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Wed, 2 Jan 2019 11:02:25 -0600 Subject: [PATCH 29/38] Trim should skip removed devices Requires-builders: none --- module/zfs/spa.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1164a381f0ee..4c9f95b8c8f6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8522,8 +8522,11 @@ spa_auto_trim(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_auto_trim_lock); for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *cvd = spa->spa_root_vdev->vdev_child[i]; + if (!vdev_is_concrete(cvd)) + continue; vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP); - vti->vti_vdev = spa->spa_root_vdev->vdev_child[i]; + vti->vti_vdev = cvd; vti->vti_txg = txg; vti->vti_done_cb = (void (*)(void *))spa_vdev_auto_trim_done; vti->vti_done_arg = spa; @@ -8612,6 +8615,8 @@ spa_man_trim(spa_t *spa, uint64_t rate, boolean_t fulltrim) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + if (!vdev_is_concrete(vd)) + continue; vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP); vti->vti_vdev = vd; vti->vti_done_cb = (void (*)(void *))spa_vdev_man_trim_done; @@ -8690,7 +8695,9 @@ spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate, mutex_enter(&spa->spa_man_trim_lock); if (spa->spa_num_man_trimming > 0) { for (uint64_t i = 0; i < root_vd->vdev_children; i++) { - total += root_vd->vdev_child[i]->vdev_trim_prog; + vdev_t *cvd = root_vd->vdev_child[i]; + if (vdev_is_concrete(cvd)) + total += cvd->vdev_trim_prog; } } *prog = total; @@ -8757,8 +8764,10 @@ spa_min_trim_rate(spa_t *spa) /* find the smallest metaslab */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - smallest_ms_sz = MIN(smallest_ms_sz, - spa->spa_root_vdev->vdev_child[i]->vdev_ms[0]->ms_size); + vdev_t *cvd = spa->spa_root_vdev->vdev_child[i]; + if (vdev_is_concrete(cvd)) + smallest_ms_sz = MIN(smallest_ms_sz, + cvd->vdev_ms[0]->ms_size); } spa_config_exit(spa, SCL_CONFIG, FTAG); VERIFY(smallest_ms_sz != 0); From af4a108f71d0b945b1b5f9b5de7a7c489dead002 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Mon, 7 Jan 2019 14:08:26 -0600 Subject: [PATCH 30/38] Don't dereference null vdev_ms Requires-builders: none --- module/zfs/spa.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 4c9f95b8c8f6..b6a2c9b590cd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8615,7 +8615,8 @@ spa_man_trim(spa_t *spa, uint64_t rate, boolean_t fulltrim) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - if (!vdev_is_concrete(vd)) + if (!vdev_is_concrete(vd) || vd->vdev_ms == NULL || + vd->vdev_ms[0] == NULL) continue; vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP); vti->vti_vdev = vd; @@ -8765,9 +8766,10 @@ spa_min_trim_rate(spa_t *spa) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *cvd = spa->spa_root_vdev->vdev_child[i]; - if (vdev_is_concrete(cvd)) - smallest_ms_sz = MIN(smallest_ms_sz, - cvd->vdev_ms[0]->ms_size); + if (!vdev_is_concrete(cvd) || cvd->vdev_ms == NULL || + cvd->vdev_ms[0] == NULL) + continue; + smallest_ms_sz = MIN(smallest_ms_sz, cvd->vdev_ms[0]->ms_size); } spa_config_exit(spa, SCL_CONFIG, FTAG); VERIFY(smallest_ms_sz != 0); From c641ee6e822626efeca48bf4e327c5334f695591 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sat, 12 Jan 2019 08:43:05 -0600 Subject: [PATCH 31/38] Account for non-concrete vdevs in spa_num_auto_trimming Also, stop trimming in places where initialization is stopped: spa_vdev_remove_log() and spa_vdev_remove_top(). Requires-builders: none --- module/zfs/spa.c | 8 +++++++- module/zfs/vdev_removal.c | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b6a2c9b590cd..63754b56f18e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8518,7 +8518,13 @@ spa_auto_trim(spa_t *spa, uint64_t txg) */ if (!mutex_tryenter(&spa->spa_auto_trim_lock)) return; - spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children; + + /* Count the number of auto trim threads which will be launched below */ + /* spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children; */ + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + if (vdev_is_concrete(spa->spa_root_vdev->vdev_child[i])) + ++spa->spa_num_auto_trimming; + } mutex_exit(&spa->spa_auto_trim_lock); for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 8d89007872f8..7afe38109e5f 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1904,6 +1904,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) /* Stop initializing */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + /* Stop trim */ + vdev_trim_stop_wait(vd); + *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -2087,6 +2090,9 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + /* Stop trim */ + vdev_trim_stop_wait(vd); + *txg = spa_vdev_config_enter(spa); /* From f8e5760d10f0a56251099fe1eeb6f3a80e18c7d4 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 13 Jan 2019 22:20:33 -0600 Subject: [PATCH 32/38] Use proper tag for spa config refcounts NOTE: This should be its own separate PR. It was discovered during debugging of the TRIM work. Requires-builders: none --- module/zfs/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 6fd5d3e9a4f3..746ee0f77fb4 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -421,7 +421,7 @@ mmp_write_uberblock(spa_t *spa) mmp->mmp_kstat_id++, error); } mutex_exit(&mmp->mmp_io_lock); - spa_config_exit(spa, SCL_STATE, FTAG); + spa_config_exit(spa, SCL_STATE, mmp_tag); return; } From c18f20a81746c71fbe5735c27496253fefb3645b Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 13 Jan 2019 22:21:18 -0600 Subject: [PATCH 33/38] Re-instate tracked spa config refcounts Requires-builders: none --- module/zfs/spa_misc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 381cc1653f99..a327a133f482 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -456,7 +456,11 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); +#ifdef ZFS_DEBUG + zfs_refcount_create_tracked(&scl->scl_count); +#else zfs_refcount_create_untracked(&scl->scl_count); +#endif scl->scl_writer = NULL; scl->scl_write_wanted = 0; } From 562bb92278b9bc933f1dfa15be6adb36a97f61c4 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Tue, 15 Jan 2019 22:55:05 -0600 Subject: [PATCH 34/38] Skip non-concrete vdevs... In trim_stop_set() and trim_stop_wait(). Requires-builders: none --- module/zfs/vdev.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ebf4010077e0..9a05ea2a392a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4952,8 +4952,11 @@ trim_stop_set(vdev_t *vd, boolean_t flag) vd->vdev_trim_zios_stop = flag; mutex_exit(&vd->vdev_trim_zios_lock); - for (uint64_t i = 0; i < vd->vdev_children; i++) - trim_stop_set(vd->vdev_child[i], flag); + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + if (vdev_is_concrete(cvd)) + trim_stop_set(cvd, flag); + } } static void @@ -4964,8 +4967,11 @@ trim_stop_wait(vdev_t *vd) cv_wait(&vd->vdev_trim_zios_cv, &vd->vdev_trim_zios_lock); mutex_exit(&vd->vdev_trim_zios_lock); - for (uint64_t i = 0; i < vd->vdev_children; i++) - trim_stop_wait(vd->vdev_child[i]); + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + if (vdev_is_concrete(cvd)) + trim_stop_wait(vd->vdev_child[i]); + } } /* From 7806538a306a4e6d9fb88fd52f54d45229620567 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Fri, 18 Jan 2019 10:51:32 -0600 Subject: [PATCH 35/38] Re-work management of the auto trim taskq... ... and related locking Don't take the spa_auto_trim_lock in the sync path: The auto trim taskq is now stopped and started in response to changes in the "autotrim" property asynchronously by the spa async task. If the spa_auto-trim_lock is taken in the sync task, the system can deadlock as follows: Non-sync context task: Acquires spa_auto_trim_lock via spa_vdev_enter() or some other path. Sync task: syncing a change to the "autotrim" property attempts to take spa_auto_trim_lock and blocks. Non-sync context task: blocks attempting to take a spa config lock which won't be released until the sync task completes. Deadlock. Also, since the auto trim taskq is now started asynchronously, it may not yet be ready yet when the sync task calls spa_auto_trim(). Modified spa_auto_trim so it will simply return if the taskq is not available yet (it will do its thing on the next pass). Also, avoid starting auto trim taskqs for the "import$" pseudo-pool. Also, don't attempt to create the taskq if a spa unload is in-progress. Requires-builders: none --- include/sys/spa.h | 2 ++ module/zfs/spa.c | 42 +++++++++++++++++++++++++++++------------- module/zfs/spa_misc.c | 4 ++++ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index a5288fe07d28..068a47ab9745 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -796,6 +796,8 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY 0x200 +#define SPA_ASYNC_AUTO_TRIM_TASKQ_CREATE 0x400 +#define SPA_ASYNC_AUTO_TRIM_TASKQ_DESTROY 0x800 /* * Controls the behavior of spa_vdev_remove(). diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 63754b56f18e..562db1a7264f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3541,7 +3541,8 @@ spa_ld_get_props(spa_t *spa) mutex_enter(&spa->spa_auto_trim_lock); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_auto_trim); - if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON && + strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) spa_auto_trim_taskq_create(spa); mutex_exit(&spa->spa_auto_trim_lock); @@ -5306,7 +5307,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, mutex_enter(&spa->spa_auto_trim_lock); spa->spa_auto_trim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); - if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON && + strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) spa_auto_trim_taskq_create(spa); mutex_exit(&spa->spa_auto_trim_lock); @@ -7250,11 +7252,27 @@ spa_async_thread(void *arg) mutex_exit(&spa_namespace_lock); } + /* + * Trim taskq management. + */ if (tasks & SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY) { mutex_enter(&spa->spa_man_trim_lock); spa_man_trim_taskq_destroy(spa); mutex_exit(&spa->spa_man_trim_lock); } + if (tasks & SPA_ASYNC_AUTO_TRIM_TASKQ_CREATE) { + mutex_enter(&spa->spa_auto_trim_lock); + if (spa->spa_auto_trim_taskq == NULL) + spa_auto_trim_taskq_create(spa); + mutex_exit(&spa->spa_auto_trim_lock); + } + if (tasks & SPA_ASYNC_AUTO_TRIM_TASKQ_DESTROY) { + mutex_enter(&spa->spa_auto_trim_lock); + if (spa->spa_auto_trim_taskq != NULL) + spa_auto_trim_taskq_destroy(spa); + mutex_exit(&spa->spa_auto_trim_lock); + } + /* * Let the world know that we're done. @@ -7768,16 +7786,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_force_trim = intval; break; case ZPOOL_PROP_AUTOTRIM: - mutex_enter(&spa->spa_auto_trim_lock); - if (intval != spa->spa_auto_trim) { - spa->spa_auto_trim = intval; - if (intval != 0) - spa_auto_trim_taskq_create(spa); - else - spa_auto_trim_taskq_destroy( - spa); - } - mutex_exit(&spa->spa_auto_trim_lock); + spa_async_request(spa, intval ? + SPA_ASYNC_AUTO_TRIM_TASKQ_CREATE : + SPA_ASYNC_AUTO_TRIM_TASKQ_DESTROY); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; @@ -8509,7 +8520,6 @@ spa_auto_trim(spa_t *spa, uint64_t txg) { ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER) == SCL_CONFIG); ASSERT(!MUTEX_HELD(&spa->spa_auto_trim_lock)); - ASSERT(spa->spa_auto_trim_taskq != NULL); /* * Another pool management task might be currently prevented from @@ -8519,6 +8529,12 @@ spa_auto_trim(spa_t *spa, uint64_t txg) if (!mutex_tryenter(&spa->spa_auto_trim_lock)) return; + /* Async start-up of the auto trim taskq may not yet have completed */ + if (spa->spa_auto_trim_taskq == NULL) { + mutex_exit(&spa->spa_auto_trim_lock); + return; + } + /* Count the number of auto trim threads which will be launched below */ /* spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children; */ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a327a133f482..1184094688bf 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2601,6 +2601,10 @@ spa_auto_trim_taskq_create(spa_t *spa) { char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + /* Don't create the taskq if the pool is unloading */ + if (spa->spa_sync_on == B_FALSE) + return; + ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock)); ASSERT(spa->spa_auto_trim_taskq == NULL); (void) snprintf(name, MAXPATHLEN, "z_atrim_%s", spa->spa_name); From e38d791debc5ab3dfbd07f86966dc096c8f507fb Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Fri, 18 Jan 2019 15:16:21 -0600 Subject: [PATCH 36/38] Move the auto trim taskq start-up on import The change in the previous commit to not attempt to create the taskq if a spa unload is in-progress caused the auto trim taskq to not be started upon import. Requires-builders: none --- module/zfs/spa.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 562db1a7264f..0317cdecd063 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3538,14 +3538,7 @@ spa_ld_get_props(spa_t *spa) spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa_prop_find(spa, ZPOOL_PROP_FORCETRIM, &spa->spa_force_trim); - - mutex_enter(&spa->spa_auto_trim_lock); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_auto_trim); - if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON && - strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) - spa_auto_trim_taskq_create(spa); - mutex_exit(&spa->spa_auto_trim_lock); - spa->spa_autoreplace = (autoreplace != 0); } @@ -4249,6 +4242,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); + /* + * Start the auto trim taskq if autotrim is enabled. + */ + mutex_enter(&spa->spa_auto_trim_lock); + if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON && + strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) + spa_auto_trim_taskq_create(spa); + mutex_exit(&spa->spa_auto_trim_lock); + + /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks From 8d4a1187bc3494b87c8a61528f813d88e481dacf Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Fri, 18 Jan 2019 16:48:52 -0600 Subject: [PATCH 37/38] Set spa->spa_auto_trim on pool creation Rather than using the default value for the property, we need to use the value which might be set by "zpool create -o autotrim=". Requires-builders: none --- module/zfs/spa.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0317cdecd063..b24979b372a0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5307,19 +5307,19 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_force_trim = zpool_prop_default_numeric(ZPOOL_PROP_FORCETRIM); - - mutex_enter(&spa->spa_auto_trim_lock); spa->spa_auto_trim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); - if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON && - strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) - spa_auto_trim_taskq_create(spa); - mutex_exit(&spa->spa_auto_trim_lock); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } + /* Handle "zpool create -o autotrim=on" */ + uint64_t auto_trim; + if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOTRIM), + &auto_trim) == 0) + spa->spa_auto_trim = auto_trim; + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -8169,7 +8169,6 @@ spa_sync(spa_t *spa, uint64_t txg) if (spa->spa_auto_trim == SPA_AUTO_TRIM_ON) spa_auto_trim(spa, txg); - /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. From 3a184b81911825810aab16022b4c8c8c2e1e9a3d Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Mon, 21 Jan 2019 08:09:26 -0600 Subject: [PATCH 38/38] Only call spa_man_trim_taskq_destroy() when needed Long zloop runs were occasionally hitting the "ASSERT(spa->spa_man_trim_taskq != NULL)" in spa_man_trim_taskq_destroy(). It's not clear to me how this was happening because the only place "spa->spa_man_trim_taskq" is cleared is in spa_man_trim_taskq_destroy() itself which is only called from the (single) spa_async_thread() or from spa_unload(). To that end, this commit adds a non-NULL check in spa_async_thread() analagous to the tests which were added when support for stopping/starting the auto trim taskqs were added to spa_async_thread(). NOTE: Yes, that means I consider this to be a band-aid. --- module/zfs/spa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b24979b372a0..47d30cc25ddd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7260,7 +7260,8 @@ spa_async_thread(void *arg) */ if (tasks & SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY) { mutex_enter(&spa->spa_man_trim_lock); - spa_man_trim_taskq_destroy(spa); + if (spa->spa_man_trim_taskq != NULL) + spa_man_trim_taskq_destroy(spa); mutex_exit(&spa->spa_man_trim_lock); } if (tasks & SPA_ASYNC_AUTO_TRIM_TASKQ_CREATE) {