diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 57170c8ae717..dc71dc5e7912 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1164,6 +1164,23 @@ zpool_do_add(int argc, char **argv) } } + /* + * Special case: + * + * We need to know the special_failsafe pool property value to determine + * if the new vdev configuration has the correct redundancy requirements + * for special and dedup vdevs. + * + * Pass in the current value for special_failsafe to the proplist. + */ + char strval[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval, + ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) { + verify(add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval, + &props, B_TRUE) == 0); + } + /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, !check_inuse, check_replication, B_FALSE, dryrun, argc, argv); @@ -6940,6 +6957,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) } } + /* + * Special case: + * + * We need to know the special_failsafe pool property value to determine + * if the new vdev configuration has the correct redundancy requirements + * for special and dedup vdevs. + * + * Pass in the current value for special_failsafe to the proplist. + */ + char strval[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval, + ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) { + verify(add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval, + &props, B_TRUE) == 0); + } + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index fbd4b81dfacc..b2fe2ec77fc2 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -85,6 +85,7 @@ */ boolean_t error_seen; boolean_t is_force; +boolean_t is_alloc_class; void vdev_error(const char *fmt, ...) @@ -94,8 +95,15 @@ vdev_error(const char *fmt, ...) 
if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); if (!is_force) - (void) fprintf(stderr, gettext("use '-f' to override " - "the following errors:\n")); + if (is_alloc_class) { + (void) fprintf(stderr, gettext("Turn on the " + "special_failsafe pool property or use '-f'" + " to override the following errors:\n")); + is_alloc_class = B_FALSE; + } else { + (void) fprintf(stderr, gettext("use '-f' to " + "override the following errors:\n")); + } else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); @@ -442,6 +450,7 @@ typedef struct replication_level { const char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; + boolean_t zprl_is_alloc_class; } replication_level_t; #define ZPOOL_FUZZ (16 * 1024 * 1024) @@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b) return (B_FALSE); } +/* + * Return true if 'props' contains: + * + * special_failsafe=on + * + * ... and feature@special_failsafe is NOT disabled. + */ +static boolean_t +is_special_failsafe_enabled_in_props(nvlist_t *props) +{ + const char *str = NULL; + + if (nvlist_lookup_string(props, "feature@special_failsafe", + &str) == 0) { + if ((str != NULL) && strcmp(str, "disabled") == 0) { + return (B_FALSE); + } + } + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), + &str) == 0) { + if ((str != NULL) && strcmp(str, "on") == 0) { + return (B_TRUE); /* It is enabled */ + } + } + + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. 
*/ static replication_level_t * -get_replication(nvlist_t *nvroot, boolean_t fatal) +get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; @@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) nvlist_t *nv; const char *type; replication_level_t lastrep = {0}; - replication_level_t rep; + replication_level_t rep = {0}; replication_level_t *ret; replication_level_t *raidz, *mirror; boolean_t dontreport; @@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; + const char *str = NULL; nv = top[t]; @@ -528,12 +568,32 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; + rep.zprl_type = type; + + /* + * If special_failsafe=on then we know the special allocation + * class devices have at least one copy of their data on the + * pool so we can ignore their replication level. + */ + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &str); + if (str && + ((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) || + (strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) { + rep.zprl_is_alloc_class = B_TRUE; + is_alloc_class = B_TRUE; + if (is_special_failsafe_enabled_in_props(props)) { + continue; /* We're backed up, skip redundancy */ + } + } else { + is_alloc_class = B_FALSE; + } + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ - rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { @@ -548,7 +608,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * We also check that the size of each vdev (if it can * be determined) is the same. */ - rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || @@ -808,7 +867,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * report any difference between the two. 
*/ static int -check_replication(nvlist_t *config, nvlist_t *newroot) +check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot) { nvlist_t **child; uint_t children; @@ -825,7 +884,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if ((current = get_replication(nvroot, B_FALSE)) == NULL) + if ((current = get_replication(props, nvroot, B_FALSE)) == NULL) return (0); } /* @@ -850,17 +909,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot) * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ - if ((new = get_replication(newroot, B_TRUE)) == NULL) { + if ((new = get_replication(props, newroot, B_TRUE)) == NULL) { free(current); return (-1); } - /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { + if (current->zprl_is_alloc_class || new->zprl_is_alloc_class) + is_alloc_class = B_TRUE; + else + is_alloc_class = B_FALSE; + + /* + * Special case: + * If there were any redundancy problems with alloc class vdevs + * BUT the pool had special_failsafe on, then we're fine since + * all the alloc class data has a copy in the main pool. + */ + if (is_special_failsafe_enabled_in_props(props) && + is_alloc_class) + goto out; + if (is_raidz_mirror(current, new, &raidz, &mirror) || is_raidz_mirror(new, current, &raidz, &mirror)) { if (raidz->zprl_parity != mirror->zprl_children - 1) { @@ -899,7 +972,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot) ret = -1; } } - +out: free(new); if (current != NULL) free(current); @@ -1888,7 +1961,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. 
*/ - if (check_rep && check_replication(poolconfig, newroot) != 0) { + if (check_rep && check_replication(props, poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e191420f2d2d..d83d5defa7ee 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -258,6 +258,7 @@ typedef enum { ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, + ZPOOL_PROP_SPECIAL_FAILSAFE, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -1610,6 +1611,7 @@ typedef enum { ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, + ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE, } zfs_errno_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index 3073c4d1b937..8d02dc8d5dac 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1117,7 +1117,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); -extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing, + uint64_t missing_special); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5605a35b8641..e5e61baeee10 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -336,6 +336,13 @@ struct spa { uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ + /* + * Number of 'spa_missing_tvds' that are alloc class devices + * in the pool that has special_failsafe on, and are thus recoverable + * from errors. 
+ */ + uint64_t spa_missing_recovered_tvds; + uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; @@ -474,6 +481,9 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ + + /* Backup special/dedup devices data to the pool */ + boolean_t spa_special_failsafe; }; extern char *spa_config_path; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 57ff31e89eb9..47fc643d9c53 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -640,6 +640,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); +extern boolean_t vdev_is_leaf(vdev_t *vd); +extern boolean_t vdev_is_special(vdev_t *vd); +extern boolean_t vdev_is_dedup(vdev_t *vd); +extern boolean_t vdev_is_alloc_class(vdev_t *vd); +extern boolean_t vdev_is_special_failsafe(vdev_t *vd); /* * Vdev ashift optimization tunables diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321759..be74255b31c6 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_SPECIAL_FAILSAFE, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 80f4b7439a55..10ec8c7eda12 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -607,7 +607,7 @@ - + @@ -2921,7 +2921,8 @@ - + + @@ -5963,7 +5964,8 @@ - + + @@ -9025,8 +9027,8 @@ - - + + @@ -9103,7 +9105,7 @@ - + diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 73ae0950ccb6..378de5a6f8ee 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -774,6 +774,15 @@ 
zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_ASHIFT_MISMATCH: zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); break; + case ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Cannot set pool prop special_failsafe=on since " + "feature@special_failsafe is not set to 'enabled'.\n" + "This could be because the special_failsafe pool prop was " + "manually turned off while the special_failsafe feature " + "flag was active, or the feature flag was disabled.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 06705ff4d9b4..6e349920d21f 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1924,7 +1924,7 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp, /* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */ static boolean_t -vdev_is_leaf(nvlist_t *nv) +vdev_is_leaf_nv(nvlist_t *nv) { uint_t children = 0; nvlist_t **child; @@ -1937,10 +1937,10 @@ vdev_is_leaf(nvlist_t *nv) /* Return if a vdev is a leaf vdev and a real device (disk or file) */ static boolean_t -vdev_is_real_leaf(nvlist_t *nv) +vdev_is_real_leaf_nv(nvlist_t *nv) { const char *type = NULL; - if (!vdev_is_leaf(nv)) + if (!vdev_is_leaf_nv(nv)) return (B_FALSE); (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type); @@ -1973,7 +1973,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, /* The very first entry in the NV list is a special case */ if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) { - if (real_leaves_only && !vdev_is_real_leaf(nv)) + if (real_leaves_only && !vdev_is_real_leaf_nv(nv)) return (0); *((nvlist_t **)last_nv) = nv; @@ -1996,7 +1996,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, * we want. 
*/ if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) { - if (real_leaves_only && !vdev_is_real_leaf(nv)) + if (real_leaves_only && !vdev_is_real_leaf_nv(nv)) return (0); *((nvlist_t **)last_nv) = nv; diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6083..9316f7983336 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -322,6 +322,40 @@ With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . +.feature org.openzfs special_failsafe yes allocation_classes +This feature allows the +.Sy special_failsafe +pool property to be used. +When the +.Sy special_failsafe +pool property is set to "on" all subsequent writes to allocation class vdevs +(like special and dedup vdevs) will also generate an additional copy of the data +to be written to the pool. +This allows alloc class vdev data to be "backed up" to the pool. +A fully backed up allocation class vdev can fail without causing the pool to be +suspended, even if the alloc class device is not redundant. +.Pp +It is important to note the difference between the +.Sy special_failsafe +feature flag and the +.Sy special_failsafe +pool property since they appear similar. +The +.Sy special_failsafe +feature flag is a safeguard to prevent a pool that is using special_failsafe +from being imported read/write on an older version of ZFS that does not support +special_failsafe (and possibly compromising the integrity of the backup +guarantees). +The pool property is what actually allows you to turn on/off the backup copy +writes. +The +.Sy special_failsafe +feature will switch from "enabled" to "active" when allocation class devices +are added. +See the +.Sy special_failsafe +pool property for more details. +. .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. 
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 18dfca6dc8ac..5e6b2c0e0db4 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -181,14 +181,18 @@ section. .It Sy dedup A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal -devices in the pool. +devices in the pool except if the +.Sy special_failsafe +pool property is enabled. If more than one dedup device is specified, then allocations are load-balanced between those devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file blocks. The redundancy of this device should match the redundancy of the other normal -devices in the pool. +devices in the pool except if the +.Sy special_failsafe +pool property is enabled. If more than one special device is specified, then allocations are load-balanced between those devices. .Pp diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 5428ab8d3076..129f8de52731 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -437,6 +437,34 @@ command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. +.It Sy special_failsafe Ns = Ns Sy on Ns | Ns Sy off +Controls the special failsafe subsystem for special allocation +class vdevs. +When it's turned on, all writes to special allocation class vdevs +(like 'special' and 'dedup' vdevs) will also write an additional copy of the +data to the main pool. +This allows alloc class vdev data to be "backed up" to the pool. +When +.Sy special_failsafe +is turned on, alloc class vdevs can fail regardless of their redundancy level +without the pool losing data. +To use +.Sy special_failsafe +simply turn it on at zpool create time, or turn it on prior to adding +alloc class devices. 
+It's important to note that after alloc class vdevs are added to the pool with +.Sy special_failsafe +on, you can still turn +.Sy special_failsafe +off again, but once it's off you can't turn it back on. +.Sy special_failsafe +can be freely toggled on/off if alloc class devices haven't been added to the +pool, since the pool prop would have no effect. +The +.Sy feature@special_failsafe +feature flag must be enabled in order to use the +.Sy special_failsafe +pool property. .El . .Ss User Properties diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14cd4..a3583faa8195 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -753,6 +753,18 @@ zpool_feature_init(void) "org.openzfs:raidz_expansion", "raidz_expansion", "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t special_failsafe_deps[] = { + SPA_FEATURE_ALLOCATION_CLASSES, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_SPECIAL_FAILSAFE, + "org.openzfs:special_failsafe", "special_failsafe", + "Save a copy of allocation class device data to main pool", + ZFEATURE_FLAG_MOS, + ZFEATURE_TYPE_BOOLEAN, special_failsafe_deps, + sfeatures); + } zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e2e3bf5be69e..e767c0e3193e 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -153,6 +153,10 @@ zpool_prop_init(void) zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_SPECIAL_FAILSAFE, + "special_failsafe", 0, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "SPECIAL_FAILSAFE", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 
7170b5eefcea..fa73f6c5da4f 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5848,10 +5848,22 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; + boolean_t is_special_failsafe = B_FALSE; + + if ((spa->spa_special_failsafe && ((mc == spa_special_class(spa)) || + (mc == spa_dedup_class(spa))))) { + is_special_failsafe = B_TRUE; + } ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); + /* + * Earlier layers of the code should set ndvas > 1 if the + * alloc class vdev is being backed up. + */ + ASSERT(!(is_special_failsafe && ndvas == 1)); + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { @@ -5866,7 +5878,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + metaslab_class_t *_mc; + if (is_special_failsafe && (d == 1)) { + /* + * If we have the special_failsafe prop set, then make + * the 2nd copy of the data we are going to write go to + * the regular pool rather than yet another copy to the + * alloc class device. That way, if the special device + * is lost, there's still a backup in the pool. 
+ */ + _mc = spa_normal_class(spa); + } else { + _mc = mc; + } + + error = metaslab_alloc_dva(spa, _mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 638572996c3a..698e102f52f4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -477,6 +477,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) DNODE_MIN_SIZE, ZPROP_SRC_NONE); } + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) { + zprop_source_t src; + if ((uint64_t)spa->spa_special_failsafe == + zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + + spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE, + NULL, spa->spa_special_failsafe, src); + } else { + /* special_failsafe not used */ + spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE, + NULL, B_FALSE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -610,6 +626,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; + boolean_t special_failsafe_prop = B_FALSE; + + /* + * The way the feature flags work here are a little interesting. + * + * At zpool creation time, this feature will not be initialized yet when + * spa_prop_validate() gets called. This works out though, as the + * feature flag will be passed in the nvlist if the feature is enabled. + * + * After the pool is created, calls to this function (like zpool set) + * will not include the feature flag in the props nvlist, but the + * feature table will be initialized, so we can use + * spa_feature_is_active(). 
+ */ + boolean_t special_failsafe_feature_disabled; + special_failsafe_feature_disabled = !(spa_feature_is_enabled(spa, + SPA_FEATURE_SPECIAL_FAILSAFE) || spa_feature_is_active(spa, + SPA_FEATURE_SPECIAL_FAILSAFE)); + + /* Did they explicitly pass feature@special_failsafe=enabled ? */ + boolean_t special_failsafe_feature_passed = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { @@ -617,6 +654,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); + spa_feature_t fid = 0; switch (prop) { case ZPOOL_PROP_INVAL: @@ -651,11 +689,30 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { + if (zfeature_lookup_name(fname, &fid) != 0) { error = SET_ERROR(EINVAL); break; } - + /* + * Special case - If both: + * + * SPA_FEATURE_SPECIAL_FAILSAFE = disabled + * + * ... and ... + * + * ZPOOL_PROP_SPECIAL_FAILSAFE = on + * + * then we need to fail. Note that the presence + * of SPA_FEATURE_SPECIAL_FAILSAFE in the + * nvlist means it is enabled (although its + * intval will be 0). If it's disabled, then + * SPA_FEATURE_SPECIAL_FAILSAFE will not + * be in the nvlist at all. 
+ */ + if (fid == SPA_FEATURE_SPECIAL_FAILSAFE) { + special_failsafe_feature_passed = + B_TRUE; + } has_feature = B_TRUE; } else { error = SET_ERROR(EINVAL); @@ -799,6 +856,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; + case ZPOOL_PROP_SPECIAL_FAILSAFE: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + if (intval == 1) + special_failsafe_prop = B_TRUE; + break; default: break; @@ -811,6 +875,26 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); + if (special_failsafe_prop && special_failsafe_feature_disabled && + !special_failsafe_feature_passed) { + /* + * We can't enable SPECIAL_FAILSAFE pool prop if the + * feature flag SPA_FEATURE_SPECIAL_FAILSAFE is + * disabled. + */ + error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE); + } + + /* + * If the user wants to turn on the special_failsafe prop, but it + * was turned off (while the feature was active), then it can't be + * turned on again. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE) && + !spa->spa_special_failsafe && special_failsafe_prop) { + error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE); + } + if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); @@ -2475,6 +2559,53 @@ spa_check_removed(vdev_t *vd) } } +/* + * Decide what to do if we have missing/corrupted alloc class devices. + * + * If we have missing top-level vdevs and they are all alloc class devices with + * special_failsafe set, then we may still be able to import the pool. + */ +static int +spa_check_for_bad_alloc_class_devices(spa_t *spa) +{ + if (spa->spa_missing_recovered_tvds == 0) + return (0); + + /* + * Are there missing alloc class devices but + * SPA_FEATURE_SPECIAL_FAILSAFE is not enabled? If so, + * then we can't import. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) { + spa_load_note(spa, "some alloc class devices are missing, " + "cannot import."); + return (SET_ERROR(ENXIO)); + } + + /* + * If all the missing top-level devices are alloc class devices, and + * if they have all their data backed up to the pool, then we can still + * import the pool. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_missing_tvds == spa->spa_missing_recovered_tvds) { + spa_load_note(spa, "only alloc class devices are missing, and " + "the normal pool has copies of the alloc class data, so " + "it's still possible to import."); + return (0); + } + + /* + * If we're here, then it means that not all the missing top-level vdevs + * were alloc class devices. This should have been caught earlier. + */ + spa_load_note(spa, "some alloc class devices that do not have a " + " special_failsafe backup copy are amongst those that are missing," + " cannot import"); + + return (SET_ERROR(ENXIO)); +} + static int spa_check_for_missing_logs(spa_t *spa) { @@ -3966,7 +4097,24 @@ spa_ld_open_vdevs(spa_t *spa) error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); - if (spa->spa_missing_tvds != 0) { + if (spa->spa_missing_tvds != 0 && + spa->spa_missing_tvds == spa->spa_missing_recovered_tvds && + (error == 0 || error == ENOENT)) { + /* + * Special case: If all the missing top-level vdevs are special + * devices, we may or may not be able to import the pool, + * depending on if the relevant special_failsafe feature and + * property are set. At this early stage of import we do not + * have the feature flags loaded yet, so for now proceed + * with the import. We will do the backup checks later after + * the feature flags are loaded. + */ + spa_load_note(spa, "vdev tree has %lld missing special " + "top-level vdevs. 
Keep importing for now until we " + "can check the feature flags.", + (u_longlong_t)spa->spa_missing_tvds); + error = 0; + } else if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { @@ -4737,6 +4885,14 @@ spa_ld_get_props(spa_t *spa) spa->spa_autoreplace = (autoreplace != 0); } + uint64_t special_failsafe = 0; + spa_prop_find(spa, ZPOOL_PROP_SPECIAL_FAILSAFE, + &special_failsafe); + if (special_failsafe) + spa->spa_special_failsafe = B_TRUE; + else + spa->spa_special_failsafe = B_FALSE; + /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on @@ -5398,6 +5554,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) if (error != 0) goto fail; + spa_import_progress_set_notes(spa, "Checking for bad alloc class " + "devices"); + spa_check_for_bad_alloc_class_devices(spa); + if (error != 0) + return (error); + + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) @@ -6589,6 +6752,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); + /* + * Set initial special_failsafe settings. These may change after the + * nvlist properties are processed a little later in spa_sync_props(). 
+ */ + spa->spa_special_failsafe = (boolean_t) + zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE); + if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); @@ -9487,6 +9657,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; +// boolean_t boolval; switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: @@ -9549,7 +9720,6 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; - case ZPOOL_PROP_INVAL: if (zpool_prop_feature(elemname)) { fname = strchr(elemname, '@') + 1; @@ -9631,6 +9801,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx) case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; + case ZPOOL_PROP_SPECIAL_FAILSAFE: + spa->spa_special_failsafe = + (boolean_t)intval; + break; default: break; } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index d1d41bbe7214..6a5aaef8596b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -738,6 +738,17 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); + + /* + * Testing showed that spa_special_failsafe needs to be on by default + * here no matter what. Later on it will be turned off since + * the feature is off by default. If you don't have it on at early + * SPA creation time, then it's impossible to import the pool with all + * the special devices missing. This could be due to the need to + * write two copies of early metadata. 
+ */ + spa->spa_special_failsafe = B_TRUE; + spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); @@ -1682,6 +1693,9 @@ spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); + + if (spa->spa_special_failsafe) + spa_feature_incr(spa, SPA_FEATURE_SPECIAL_FAILSAFE, tx); } /* @@ -2850,10 +2864,21 @@ spa_syncing_log_sm(spa_t *spa) return (spa->spa_syncing_log_sm); } +/* + * Record the total number of missing top-level vdevs ('missing'), and the + * number of missing top-level vdevs that are recoverable ('missing_recovered'). + * In this case, missing_recovered is the number of top-level alloc class vdevs + * that are recoverable since the special_failsafe pool prop was on, and thus + * their data is "backed up" to the main pool. + * + * The separate 'missing_recovered' count is used during pool import to + * determine if we can import a pool with missing alloc class vdevs. + */ void -spa_set_missing_tvds(spa_t *spa, uint64_t missing) +spa_set_missing_tvds(spa_t *spa, uint64_t missing, uint64_t missing_recovered) { spa->spa_missing_tvds = missing; + spa->spa_missing_recovered_tvds = missing_recovered; } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c74f72159dc9..a11b4d49597c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -728,6 +728,60 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) return (vd); } +boolean_t +vdev_is_leaf(vdev_t *vd) +{ + return (vd->vdev_children == 0); +} + +/* Return true if vdev or TLD vdev is special alloc class */ +boolean_t +vdev_is_special(vdev_t *vd) +{ + if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL) + return (B_TRUE); + + /* + * If the vdev is a leaf vdev, and is part of a mirror, its parent + * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_SPECIAL, but the + * leaf vdev itself will not. 
So we also need to check the parent + * in those cases. + */ + if (vdev_is_leaf(vd) && + (vd->vdev_parent != NULL && vdev_is_special(vd->vdev_parent))) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* Return true if vdev or TLD vdev is dedup alloc class */ +boolean_t +vdev_is_dedup(vdev_t *vd) +{ + if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP) + return (B_TRUE); + + /* + * If the vdev is a leaf vdev, and is part of a mirror, it's parent + * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_DEDUP, but the + * leaf vdev itself will not. So we also need to check the parent + * in those cases. + */ + if (vdev_is_leaf(vd) && + (vd->vdev_parent != NULL && vdev_is_dedup(vd->vdev_parent))) { + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +vdev_is_alloc_class(vdev_t *vd) +{ + return (vdev_is_special(vd) || vdev_is_dedup(vd)); +} + /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly @@ -746,6 +800,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); + const char *bias = NULL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -797,8 +852,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { - const char *bias; - /* * If creating a top-level vdev, check for allocation * classes input. 
@@ -840,6 +893,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_tsd = tsd; vd->vdev_islog = islog; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + } + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -3690,8 +3748,9 @@ vdev_load(vdev_t *vd) VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { - ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); - vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + if (vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = + vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -4150,7 +4209,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -4366,8 +4426,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) * don't allow it to be offlined. Log devices are always * expendable. 
*/ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_dtl_required(vd)) + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); @@ -4423,7 +4483,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) vd->vdev_offline = B_TRUE; vdev_reopen(tvd); - if (!tvd->vdev_islog && vd->vdev_aux == NULL && + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); @@ -5269,10 +5330,14 @@ vdev_propagate_state(vdev_t *vd) * device, treat the root vdev as if it were * degraded. */ - if (child->vdev_islog && vd == rvd) + if ((child->vdev_islog || + vdev_is_special_failsafe(child)) && + (vd == rvd)) { degraded++; - else + } else { faulted++; + } + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } @@ -5448,8 +5513,9 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_post_state_change(spa, vd, save_state); } - if (!isopen && vd->vdev_parent) + if (!isopen && vd->vdev_parent) { vdev_propagate_state(vd->vdev_parent); + } } boolean_t @@ -5517,6 +5583,24 @@ vdev_log_state_valid(vdev_t *vd) return (B_FALSE); } +/* + * Is the vdev an alloc class vdev that is part of a pool that has + * special_failsafe on, and thus has all its data backed up to the main pool? + * + * This function works for both top-level vdevs and leaf vdevs. + */ +boolean_t +vdev_is_special_failsafe(vdev_t *vd) +{ + if (vdev_is_alloc_class(vd)) + return (vd->vdev_spa->spa_special_failsafe); + + if (vdev_is_leaf(vd) && vd->vdev_parent != NULL) + return (vdev_is_special_failsafe(vd->vdev_parent)); + + return (B_FALSE); +} + /* * Expand a vdev if possible.
*/ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index ed592514fded..5469409550ae 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -521,8 +521,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_removing); } - /* zpool command expects alloc class data */ - if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { @@ -539,6 +538,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } + fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } @@ -1804,9 +1804,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; + boolean_t failure_but_special_failsafe = B_FALSE; + int rc; zio = zio_root(spa, NULL, NULL, flags); - for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); @@ -1850,7 +1851,38 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) (void) zio_wait(zio); - return (good_writes >= 1 ? 0 : EIO); + /* + * Special case: + * + * If we had zero good writes, but all the writes were to alloc class + * disks that were on a pool with special_failsafe on, then it's not + * fatal. + */ + if (good_writes == 0) { + failure_but_special_failsafe = B_TRUE; + for (int v = 0; v < svdcount; v++) { + if (!vdev_is_special_failsafe(svd[v])) { + failure_but_special_failsafe = B_FALSE; + break; + } + } + } + + if (good_writes >= 1) { + /* success */ + rc = 0; + } else if (failure_but_special_failsafe) { + /* + * All the failures are on allocation class disks that were + * fully backed up to the pool, so this isn't fatal. 
+ */ + rc = 0; + } else { + /* failure */ + rc = EIO; + } + + return (rc); } /* @@ -1966,7 +1998,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio_t *vio = zio_null(zio, spa, NULL, - (vd->vdev_islog || vd->vdev_aux != NULL) ? + (vd->vdev_islog || vd->vdev_aux != NULL || + vdev_is_special_failsafe(vd)) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags); @@ -2019,6 +2052,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); + flags |= ZIO_FLAG_TRYHARD; } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index e132643dc330..3833bdf89d8d 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -32,6 +32,7 @@ #include #include #include +#include /* * Virtual device vector for the pool's root vdev. @@ -46,6 +47,7 @@ vdev_root_core_tvds(vdev_t *vd) vdev_t *cvd = vd->vdev_child[c]; if (!cvd->vdev_ishole && !cvd->vdev_islog && + !vdev_is_special_failsafe(cvd) && cvd->vdev_ops != &vdev_indirect_ops) { tvds++; } @@ -87,6 +89,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, spa_t *spa = vd->vdev_spa; int lasterror = 0; int numerrors = 0; + int numerrors_recovered = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -97,18 +100,25 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - if (cvd->vdev_open_error && !cvd->vdev_islog && cvd->vdev_ops != &vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; + if (vdev_is_special_failsafe(cvd)) + numerrors_recovered++; } } - if (spa_load_state(spa) != SPA_LOAD_NONE) - spa_set_missing_tvds(spa, numerrors); + if (spa_load_state(spa) != SPA_LOAD_NONE) { + spa_set_missing_tvds(spa, numerrors, numerrors_recovered); + } - if
(too_many_errors(vd, numerrors)) { + if (numerrors != 0 && (numerrors == numerrors_recovered)) { + vdev_dbgmsg(vd, "there were %lu top-level errors, but they were" + " all alloc class vdevs with special_failsafe. Keep trying" + " to import.", + (long unsigned) numerrors); + } else if (too_many_errors(vd, numerrors)) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d68d5ababe79..78033064f370 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3501,6 +3501,19 @@ zio_ddt_write(zio_t *zio) ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); + /* + * Dedup writes can either go to a dedicated dedup device or to a + * dedicated special device. If we have special_failsafe on, we need + * to make an extra copy of the data to go on the pool. To do this + * we need to adjust the ZIO's copies here so the later stages in the + * ZIO pipeline work correctly. + */ + if (spa->spa_special_failsafe && zp->zp_copies == 1) { + zp->zp_copies = 2; + } + + p = zp->zp_copies; + ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; @@ -3631,6 +3644,22 @@ zio_dva_throttle(zio_t *zio) mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); + /* + * If the special_failsafe pool prop is enabled, we will do the regular + * write to the special/dedup device and an additional "backup" + * write to the normal pool. That way if the special/dedup devices + * all fail, we don't lose all data in our pool. + * + * Reserve that 2nd write to the regular pool here. The DVAs + * for both writes will later be allocated in the + * next step in the ZIO pipeline in + * zio_dva_allocate()->metaslab_alloc().
+ */ + if ((spa->spa_special_failsafe && (mc == spa_special_class(spa) || + mc == spa_dedup_class(spa))) && zio->io_prop.zp_copies == 1) { + zio->io_prop.zp_copies = 2; + } + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ac2c541a9188..3e5566aa7e65 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -53,6 +53,14 @@ tags = ['functional', 'arc'] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] + +[tests/functional/special_failsafe] +tests = ['special_failsafe_add', 'special_failsafe_create', + 'special_failsafe_files', 'special_failsafe_import', + 'special_failsafe_offline', 'special_failsafe_prop', + 'special_failsafe_scrub', 'special_failsafe_split'] +tags = ['functional', 'special_failsafe'] + [tests/functional/bclone] tests = ['bclone_crossfs_corner_cases_limited', 'bclone_crossfs_data', diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index dfab48d2cdaf..7ccdd9bf12bf 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1081,6 +1081,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop=$1 + typeset pool=$2 + typeset vdev=$3 + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? 
otherwise # # $1 - pool name @@ -1815,7 +1825,8 @@ function verify_pool function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ - grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") + grep -vEe '^-----' | \ + grep -Ev '^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|-[0-9]$') } # @@ -3907,3 +3918,28 @@ function pop_coredump_pattern ;; esac } + +# Get a list of all vdevs in the pool that are a certain type. +# +# The returned list is in a space-separated string, with the full path of each +# vdev included: +# +# "/dev/sda /dev/sdb /dev/sdc" +# +# $1: Type of disk to get ('special', 'dedup', 'log', 'cache', 'spare') +# $2: (optional) pool name +function get_list_of_vdevs_that_are { + poolname=${2:-$TESTPOOL} + + zpool status -P $poolname | sed -r '/\s+(mirror|draid|raidz)/d' | \ + awk -v token="$1" '{ + if (tmp == 1 && substr($1,1,1) == "/") { + if (first != 1) { + printf "%s", $1; + first=1; + } else { + printf " %s", $1; + } + } else {tmp=0}; if ($1 == token) {tmp=1}} + END {print ""}' +} diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 44eedcf6fae5..42b0989907f4 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ + functional/special_failsafe/special_failsafe.cfg \ + functional/special_failsafe/special_failsafe.kshlib \ functional/bclone/bclone.cfg \ functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_corner_cases.kshlib \ @@ -441,6 +443,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ + functional/special_failsafe/special_failsafe_add.ksh \ + functional/special_failsafe/special_failsafe_create.ksh \ + 
functional/special_failsafe/special_failsafe_files.ksh \ + functional/special_failsafe/special_failsafe_import.ksh \ + functional/special_failsafe/special_failsafe_prop.ksh \ + functional/special_failsafe/special_failsafe_offline.ksh \ + functional/special_failsafe/special_failsafe_scrub.ksh \ + functional/special_failsafe/special_failsafe_split.ksh \ + functional/special_failsafe/cleanup.ksh \ + functional/special_failsafe/setup.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_data.ksh \ diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh index 3237d7cb784f..4ea64f8318e6 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh @@ -32,12 +32,16 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for type in special dedup; do - log_mustnot zpool create -d $TESTPOOL $CLASS_DISK0 $type $CLASS_DISK1 + +for arg in '-o special_failsafe=on' '' ; do + for type in special dedup; do + log_mustnot zpool create $arg -d $TESTPOOL $CLASS_DISK0 $type \ + $CLASS_DISK1 + done + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must display_status "$TESTPOOL" + log_must zpool destroy -f "$TESTPOOL" done -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 -log_must display_status "$TESTPOOL" -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh index 78d40ce56d4e..7ab6552ebb0c 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
@@ -31,27 +31,29 @@ log_onexit cleanup log_must disk_setup -for type in "" "mirror" "raidz" -do - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS - - if [ "$type" = "mirror" ]; then - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 - elif [ "$type" = "raidz" ]; then - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 - else - log_must zpool add $TESTPOOL special $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - fi - - log_must zpool destroy -f $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS + + if [ "$type" = "mirror" ]; then + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 + elif [ "$type" = "raidz" ]; then + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + else + log_must zpool add $TESTPOOL special $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + fi + + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh index 04ce486adb83..131bf79ff306 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh @@ -36,31 +36,35 @@ typeset ac_value typeset stype="" 
typeset sdisks="" -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" - elif [ "$type" = "raidz" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1}" - else - stype="" - sdisks="${CLASS_DISK0}" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" + elif [ "$type" = "raidz" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + else + stype="" + sdisks="${CLASS_DISK0}" + fi - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ - special $stype $sdisks + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \ + special $stype $sdisks - ac_value="$(zpool get -H -o property,value all | awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "active" ]; then - log_note "feature@allocation_classes is active" - else - log_fail "feature@allocation_classes not active, \ - status = $ac_value" - fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" + if [ "$ac_value" = "active" ]; then + log_note "feature@allocation_classes is active" + else + log_fail "feature@allocation_classes not active, \ + status = $ac_value" + fi - log_must zpool destroy -f $TESTPOOL + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh index 08c703e21acb..6e74b0a6b465 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh @@ -34,38 +34,44 @@ log_must disk_setup typeset ac_value -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - log_must zpool create $TESTPOOL $type $ZPOOL_DISK0 
$ZPOOL_DISK1 - else - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS - fi - ac_value="$(zpool get -H -o property,value all | \ - awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "enabled" ]; then - log_note "feature@allocation_classes is enabled" - else - log_fail "feature@allocation_classes not enabled, \ - status = $ac_value" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISK0 \ + $ZPOOL_DISK1 + else + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS + fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" + if [ "$ac_value" = "enabled" ]; then + log_note "feature@allocation_classes is enabled" + else + log_fail "feature@allocation_classes not enabled, \ + status = $ac_value" + fi - if [ "$type" = "" ]; then - log_must zpool add $TESTPOOL special $CLASS_DISK0 - else - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - fi - ac_value="$(zpool get -H -o property,value all | \ - awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "active" ]; then - log_note "feature@allocation_classes is active" - else - log_fail "feature@allocation_classes not active, \ - status = $ac_value" - fi + if [ "$type" = "" ]; then + log_must zpool add $TESTPOOL special $CLASS_DISK0 + else + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" - log_must zpool destroy -f $TESTPOOL + if [ "$ac_value" = "active" ]; then + log_note "feature@allocation_classes is active" + else + log_fail "feature@allocation_classes not active, \ + status = $ac_value" + fi + + log_must zpool destroy -f $TESTPOOL + done done log_pass "Values of allocation_classes feature flag correct." 
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh index 5852b2876e89..fc20fea6d096 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh @@ -32,10 +32,14 @@ log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL \ - mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ - special mirror $CLASS_DISK0 $CLASS_DISK1 -log_must zpool split $TESTPOOL split_pool -log_must zpool destroy -f $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL \ + mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + log_must zpool split $TESTPOOL split_pool + log_must zpool import -d $(dirname $CLASS_DISK1) split_pool + log_must zpool destroy -f $TESTPOOL + log_must zpool destroy -f split_pool +done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh index 106a6d933aac..a08732e6248f 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh @@ -31,11 +31,13 @@ log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \ - special mirror $CLASS_DISK0 $CLASS_DISK1 -log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2 -log_must sleep 10 -log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 -log_must zpool destroy -f $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS \ + special mirror $CLASS_DISK0 $CLASS_DISK1 + log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2 + log_must sleep 10 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 + log_must zpool destroy -f $TESTPOOL +done log_pass $claim diff --git 
a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh index f73fbbe38c9b..2ac1024e351d 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh @@ -35,22 +35,24 @@ typeset special_type="" typeset create_disks="" typeset added_disks="" -for type in "" "raidz" -do - if [ "$type" = "raidz" ]; then - special_type="mirror" - create_disks="${CLASS_DISK0} ${CLASS_DISK1}" - added_disks="${CLASS_DISK2} ${CLASS_DISK3}" - else - special_type="" - create_disks="${CLASS_DISK0}" - added_disks="${CLASS_DISK1}" - fi - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ - special $special_type $create_disks - log_must zpool add $TESTPOOL special $special_type $added_disks - log_must zpool iostat $TESTPOOL $added_disks - log_must zpool destroy -f $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + for type in "" "raidz" + do + if [ "$type" = "raidz" ]; then + special_type="mirror" + create_disks="${CLASS_DISK0} ${CLASS_DISK1}" + added_disks="${CLASS_DISK2} ${CLASS_DISK3}" + else + special_type="" + create_disks="${CLASS_DISK0}" + added_disks="${CLASS_DISK1}" + fi + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \ + special $special_type $create_disks + log_must zpool add $TESTPOOL special $special_type $added_disks + log_must zpool iostat $TESTPOOL $added_disks + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh index e8061fdabcbd..db9fa468eab2 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh @@ -35,35 +35,39 @@ typeset stype="" typeset sdisks="" typeset props="" -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" - props="-o ashift=12" - elif [ "$type" = "raidz" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1}" - else - stype="" - sdisks="${CLASS_DISK0}" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type"
= "mirror" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" - props="-o ashift=12" - elif [ "$type" = "raidz" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1}" - else - stype="" - sdisks="${CLASS_DISK0}" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" + props="-o ashift=12" + elif [ "$type" = "raidz" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + else + stype="" + sdisks="${CLASS_DISK0}" + fi - # - # 1/3 of the time add the special vdev after creating the pool - # - if [ $((RANDOM % 3)) -eq 0 ]; then - log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS - log_must zpool add ${props} $TESTPOOL special $stype $sdisks - else - log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS \ - special $stype $sdisks - fi + # + # 1/3 of the time add the special vdev after creating the pool + # + if [ $((RANDOM % 3)) -eq 0 ]; then + log_must zpool create $arg ${props} $TESTPOOL $type \ + $ZPOOL_DISKS + log_must zpool add ${props} $TESTPOOL special $stype \ + $sdisks + else + log_must zpool create $arg ${props} $TESTPOOL $type \ + $ZPOOL_DISKS special $stype $sdisks + fi - log_must zpool export $TESTPOOL - log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL - log_must display_status $TESTPOOL - log_must zpool destroy -f $TESTPOOL + log_must zpool export $TESTPOOL + log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL + log_must display_status $TESTPOOL + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh index cbf5cbf89bdc..913f03f72fcb 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh 
@@ -32,19 +32,22 @@ log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - -for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 -do - log_must zfs set special_small_blocks=$value $TESTPOOL - ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ - awk '/special_small_blocks/ {print $3}') - if [ "$ACTUAL" != "$value" ] - then - log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value!" - fi +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + + for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 + do + log_must zfs set special_small_blocks=$value $TESTPOOL + ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ + awk '/special_small_blocks/ {print $3}') + if [ "$ACTUAL" != "$value" ] + then + log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value" + fi + done + + log_must zpool destroy -f "$TESTPOOL" done -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index 0be49b858758..ffc8b84468dc 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -32,13 +32,17 @@ log_assert $claim log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 33554432 -do - log_mustnot zfs set special_small_blocks=$value $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + + for value in 256 1025 33554432 + do + log_mustnot zfs set special_small_blocks=$value $TESTPOOL + done + + log_must zpool destroy -f "$TESTPOOL" done 
-log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh index 0b1c18bafdaf..16d25a3f282a 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh @@ -25,20 +25,20 @@ verify_runnable "global" # -# Verify the file identified by the input is written on a special vdev -# According to the pool layout used in this test vdev_id 3 and 4 are special -# XXX: move this function to libtest.shlib once we get "Vdev Properties" +# Given a dataset and an inode number, return a list of all the vdev numbers +# that the inode has blocks on. # -function file_in_special_vdev # +# For example, if the inode has blocks on vdevs 0, 1 and 2, this would return +# the string "0 1 2" +# +function vdevs_file_is_on # { typeset dataset="$1" typeset inum="$2" - typeset num_normal=$(echo $ZPOOL_DISKS | wc -w) - num_normal=${num_normal##* } - - zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ + zdb -dddddd $dataset $inum | awk ' +/L0 [0-9]+/{ # find DVAs from string "offset level dva" only for L0 (data) blocks -if (match($0,"L0 [0-9]+")) { +# if (match($0,"L0 [0-9]+")) { dvas[0]=$3 dvas[1]=$4 dvas[2]=$5 @@ -50,25 +50,46 @@ if (match($0,"L0 [0-9]+")) { print "Error parsing DVA: <" dva ">"; exit 1; } - # verify vdev is "special" - if (arr[1] < d) { - exit 1; - } + count[arr[1]]++; } } -}}' +#} +} +END { + # Print out the unique vdev numbers that had data + firstprint=1; + for (i in count) { + if (firstprint==1) { + printf("%d", i); + firstprint=0; + } else { + printf(" %d", i); + } + } +} +' } # # Check that device removal works for special class vdevs # +# $1: Set to 1 to backup alloc class data to the pool. Leave blank to disable +# backup. 
function check_removal { + typeset backup + if [ "$1" == "1" ] ; then + backup=1 + args="-o special_failsafe=on" + else + backup=0 + args="" + fi + # # Create a non-raidz pool so we can remove top-level vdevs # - log_must disk_setup - log_must zpool create $TESTPOOL $ZPOOL_DISKS \ + log_must zpool create $args $TESTPOOL $ZPOOL_DISKS \ special $CLASS_DISK0 special $CLASS_DISK1 log_must display_status "$TESTPOOL" @@ -93,19 +114,49 @@ function check_removal for i in 1 2 3 4; do dataset="$TESTPOOL/$TESTFS" inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)" - log_must file_in_special_vdev $dataset $inum + + # Get a list of all the vdevs 'testfile.$i' has blocks on. + # The list will be string like "0 1 2 3" if the blocks are on + # vdevs 0-3. + on_vdevs="$(vdevs_file_is_on $dataset $inum)" + + # Get the number of normal (non-special) pool disks + num_pool_disks=$(echo $ZPOOL_DISKS | wc -w) + num_pool_disks=${num_pool_disks##* } + + if [ "$backup" == "1" ] ; then + # Data should be on all vdevs (both pool and special + # devices). + lowest_data_disk=0 + highest_data_disk=$(($num_pool_disks + 1)) + else + + # Data should only be on special devices + lowest_data_disk=$num_pool_disks + highest_data_disk=$(($lowest_data_disk + 1)) + fi + + # Get the starting disks that we expect the data to be on. + # We assume two special devices are attached to the pool. + # Disk numbers start at zero. + expected_on_vdevs="$(seq -s ' ' $lowest_data_disk $highest_data_disk)" + + # Compare the disks we expect to see the blocks on with + # the actual disks they're on. + if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then + # Data distribution is not what we expected, break out of + # the loop so we can properly tear down the pool. We will + # error out after the loop. 
+ break; + fi done log_must zpool remove $TESTPOOL $CLASS_DISK0 - - sleep 5 - sync_pool $TESTPOOL - sleep 1 - - log_must zdb -bbcc $TESTPOOL - log_must zpool list -v $TESTPOOL log_must zpool destroy -f "$TESTPOOL" - log_must disk_cleanup + + if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then + log_fail "Expected data on disks $expected_on_vdevs, got $on_vdevs" + fi } claim="Removing a special device from a pool succeeds." @@ -113,12 +164,15 @@ claim="Removing a special device from a pool succeeds." log_assert $claim log_onexit cleanup -typeset CLASS_DEVSIZE=$CLASS_DEVSIZE -for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do - typeset ZPOOL_DISKS=$ZPOOL_DISKS - for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do - check_removal +log_must disk_setup +for backup in "1" "" ; do + typeset CLASS_DEVSIZE=$CLASS_DEVSIZE + for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do + typeset ZPOOL_DISKS=$ZPOOL_DISKS + for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do + check_removal $backup + done done done - +log_must disk_cleanup log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh index 624cab88af0c..789bf816eabb 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh @@ -33,31 +33,34 @@ log_onexit cleanup # Create a non-raidz pool so we can remove top-level vdevs # log_must disk_setup -log_must zpool create $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0 -log_must display_status "$TESTPOOL" -# -# Generate some dedup data in the dedup class before removal -# +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0 + log_must display_status "$TESTPOOL" -log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL -block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL" -log_must eval "new_fs 
$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null" + # + # Generate some dedup data in the dedup class before removal + # -sync_pool -log_must zpool list -v $TESTPOOL + log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL + block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL" + log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null" -# -# remove a dedup allocation vdev -# -log_must zpool remove $TESTPOOL $CLASS_DISK0 + sync_pool + log_must zpool list -v $TESTPOOL + + # + # remove a dedup allocation vdev + # + log_must zpool remove $TESTPOOL $CLASS_DISK0 -sleep 5 -sync_pool $TESTPOOL -sleep 1 + sleep 5 + sync_pool $TESTPOOL + sleep 1 -log_must zdb -bbcc $TESTPOOL + log_must zdb -bbcc $TESTPOOL -log_must zpool destroy -f "$TESTPOOL" + log_must zpool destroy -f "$TESTPOOL" +done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh index 1b52014fd2d9..aae7ecbe9568 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh @@ -26,13 +26,15 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for size in 512 4096 32768 131072 524288 1048576 -do - let bigger=$size*2 - log_mustnot zpool create -O recordsize=$size \ - -O special_small_blocks=$bigger \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 +for arg in '-o special_failsafe=on' '' ; do + for size in 512 4096 32768 131072 524288 1048576 + do + let bigger=$size*2 + log_mustnot zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$bigger \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh index 49c468af6702..3922f8cb7bf9 100755 --- 
a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh @@ -26,20 +26,22 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for size in 8192 32768 131072 524288 1048576 -do - let smaller=$size/2 - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$smaller \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" - - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$size \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" +for arg in '-o special_failsafe=on' '' ; do + for size in 8192 32768 131072 524288 1048576 + do + let smaller=$size/2 + log_must zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$smaller \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" + + log_must zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$size \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 6ebce9459190..62388d7dbc72 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -61,6 +61,7 @@ typeset -a properties=( "bcloneused" "bclonesaved" "bcloneratio" + "special_failsafe" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" @@ -87,6 +88,7 @@ typeset -a properties=( "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" + "feature@special_failsafe" ) if is_linux || is_freebsd; then diff --git 
a/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh new file mode 100755 index 000000000000..5681caecfc52 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh @@ -0,0 +1,27 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018, Delphix +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +verify_runnable "global" + +default_cleanup_noexit +disk_cleanup + +log_pass diff --git a/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh new file mode 100755 index 000000000000..5c2e45c8dc2e --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh @@ -0,0 +1,24 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +verify_runnable "global" + +disk_cleanup + +log_pass diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg new file mode 100644 index 000000000000..84200593eb38 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg @@ -0,0 +1,36 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +export ZPOOL_DISK0="$TEST_BASE_DIR/device-0" +export ZPOOL_DISK1="$TEST_BASE_DIR/device-1" +export ZPOOL_DISK2="$TEST_BASE_DIR/device-2" +export ZPOOL_DISKS="${ZPOOL_DISK0} ${ZPOOL_DISK1} ${ZPOOL_DISK2}" + +export CLASS_DISK0="$TEST_BASE_DIR/device-3" +export CLASS_DISK1="$TEST_BASE_DIR/device-4" +export CLASS_DISK2="$TEST_BASE_DIR/device-5" +export CLASS_DISK3="$TEST_BASE_DIR/device-6" +export CLASS_DISK4="$TEST_BASE_DIR/device-7" +export CLASS_DISK5="$TEST_BASE_DIR/device-8" + +export CLASS_DISKS="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2} ${CLASS_DISK3} ${CLASS_DISK4} ${CLASS_DISK5}" + +export ZPOOL_DEVSIZE=200M +export CLASS_DEVSIZE=200M + +export IMPORTDIR="$TEST_BASE_DIR" diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib new file mode 100644 index 000000000000..21aa6acd9aca --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib @@ -0,0 +1,255 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.cfg + +BACKUP_DIR=$TEST_BASE_DIR/backups + +function disk_setup +{ + truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS + truncate -s $CLASS_DEVSIZE $CLASS_DISKS + + if [ -d $BACKUP_DIR ] ; then + log_fail "Existing $TEST_BASE_DIR/backups directory (maybe leftover from failed test run?)" + fi + + mkdir -p $BACKUP_DIR +} + +function disk_cleanup +{ + rm -f $ZPOOL_DEVSIZE $ZPOOL_DISKS 2> /dev/null + rm -f $CLASS_DEVSIZE $CLASS_DISKS 2> /dev/null + + rm -f special_failsafe.key + rm -fr $BACKUP_DIR +} + +function cleanup +{ + if datasetexists $TESTPOOL ; then + zpool destroy -f $TESTPOOL 2> /dev/null + fi + + disk_cleanup +} + +# Write zeros to an existing file, keeping the same size. +function zero_file { + dd status=none if=/dev/zero of="$1" bs=$(stat_size "$1") count=1 +} + +# Write a verifiable file that will end up on a 'dedup' or 'special' vdev. +# The filename will include the sha256 of the file for easy verification later. +# +# $1: Write type - "dedup" or "special" +# $2: Path to directory to write the file to +# +# Note: we don't use log_must here since this can get really chatty and +# we don't want to spam the logs. It will log_fail if there is an error. 
+function write_verifiable_file { + class="$1" + writedir="$2" + + if [[ "$class" == "dedup" ]] ; then + # Our dedup file size can be up to a megabyte-ish + filesize=$((32768 + ($RANDOM * $RANDOM % 1000000))) + + # Make write a multiple of the recordsize for dedup + bs=32768 + count=$(($filesize / $bs)) + + # Fill data with the letter 'a' for dedup + file_write -b $bs -c $count -d 'a' -o create -f $writedir/tmp || return + else + # Make all files less than the 32k special_small_blocks size we + # setup at dataset creation time + filesize=$((($RANDOM % 32767) + 1)) + bs=$filesize + count=1 + dd status=none if=/dev/urandom bs=$bs count=$count of="$writedir/tmp" || return + fi + + + csum=$(sha256digest "$writedir/tmp") + newfile=$csum.$class$totalwritten + mv "$writedir/tmp" "$writedir/$newfile" + + # Basic sanity that we created our final file, and it has a non-zero size + expectedsize=$(($bs * $count)) + actualsize=$(stat_size "$writedir/$newfile") + if [[ "$actualsize" != "$expectedsize" ]] || [[ "$actualsize" == "0" ]] ; then + log_fail "File $writedir/$newfile bad size $actualsize (expected $expectedsize)" + return + fi + + totalwritten=$(($totalwritten + 1)) +} + +# Write some files to all our datasets. +# +# For each dataset: +# +# - 10 files should hit special vdevs +# - 10 files should hit dedup vdevs +function write_some_files { + typeset i + for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do + for j in $(seq 1 10) ; do + write_verifiable_file special /$TESTPOOL/$i + write_verifiable_file dedup /$TESTPOOL/$i + done + done +} + +# Given a directory containing only files created by write_verifiable_file(), +# verify that the contents of the file match the sha256sum in the file's name. 
+# +# $1: Dir path with files to verify +function verify_directory { + typeset verifydir="$1" + typeset i + for i in $(ls $verifydir) ; do + + # Files will look like: + # + # ed324386045fa39d3f41d4f13c8c3e6a4698466e2b694c327f7e490be9e4e33f.dedup13 + # + # Just grab the sha256 part + + shaname="$(echo $i | cut -f1 -d'.')" + if [[ $(sha256digest "$verifydir/$i") != "$shaname" ]] ; then + log_fail "$verifydir/$i sha256 not $shaname" + false + return + fi + done + true +} + +function backup_alloc_class_disks { + typeset i + for i in $@ ; do + cp ${i} $BACKUP_DIR/$(basename $i) + done +} + +function restore_alloc_class_disks { + typeset i + for i in $@ ; do + mv $BACKUP_DIR/$(basename $i) ${i} + done +} + +function zero_alloc_class_disks { + typeset i + for i in $@ ; do + zero_file "${i}" + done +} + +# Create multiple datasets with different permutations of copies and encryption +function special_failsafe_make_datasets { + + log_must zfs create -o compression=off -o special_small_blocks=32K -o recordsize=32K \ + -o dedup=on $TESTPOOL/$TESTFS + + keyfile=$(pwd)/special_failsafe.key + dd if=/dev/urandom of=$keyfile bs=32 count=1 + + log_must zfs create -o copies=2 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \ + $TESTPOOL/2copies + + log_must zfs create -o copies=3 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \ + $TESTPOOL/3copies + + log_must zfs create -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \ + $TESTPOOL/encrypted + + log_must zfs create -o copies=2 -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \ + $TESTPOOL/encrypted2copies + + log_must zfs create -o copies=3 -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \ + $TESTPOOL/encrypted3copies +} + +# For each dataset we created in special_failsafe_make_datasets, 
go through +# and check that all the files in the datasets have the correct data. +function verify_all_directories { + typeset i + for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do + verify_directory /$TESTPOOL/$i + done + + # ...we should also have the correct number of files + totalfiles=0 + for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do + totalfiles=$(($totalfiles + $(ls /$TESTPOOL/$i | wc -w))) + done + + if [[ "$totalfiles" != "$totalwritten" ]] ; then + log_fail "Wrong file count: expected $totalwritten, got $totalfiles" + else + log_note "Verified $totalfiles files" + fi +} + +# Return a space separated string of disks that are alloc class vdevs. Disk +# names will include the full path. +function get_list_of_alloc_class_disks { + typeset special_disks=$(get_list_of_vdevs_that_are "special") + typeset dedup_disks=$(get_list_of_vdevs_that_are "dedup") + typeset disks="$dedup_disks" + + if [ -n "$special_disks" ] ; then + disks="$special_disks $disks" + fi + + echo "$disks" +} + +# Check that the pool/vdev properties and features for alloc class backups +# are sane. +function check_pool_alloc_class_props { + typeset special_failsafe_feature=$(get_pool_prop feature@special_failsafe $TESTPOOL) + typeset special_failsafe_prop=$(get_pool_prop special_failsafe $TESTPOOL) + if [ "$special_failsafe_feature" == "disabled" ] ; then + log_must [ "$special_failsafe_prop" == "off" ] + fi +} + +# Simple function to check pool and vdev properties are what we expect. The +# values we expect are passed to this function: +# +# $1: 'feature@special_failsafe' pool feature +# $2: 'special_failsafe' pool prop +# +# This function will log_fail on error.
+function boilerplate_check { + typeset special_failsafe_feature=$1 + typeset special_failsafe_prop=$2 + + if [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" != "$special_failsafe_feature" ] ; then + log_fail "feature@special_failsafe = $(get_pool_prop feature@special_failsafe $TESTPOOL), expected $special_failsafe_feature" + fi + + if [ "$(get_pool_prop special_failsafe $TESTPOOL)" != "$special_failsafe_prop" ] ; then + log_fail "special_failsafe = $(get_pool_prop special_failsafe $TESTPOOL), expected $special_failsafe_prop" + fi +} diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh new file mode 100755 index 000000000000..36ff874cb00e --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh @@ -0,0 +1,96 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Verify that 'zpool add' and 'zpool attach' disks have the correct +# special_failsafe settings. 
+ +verify_runnable "global" + +claim="zpool add|attach disks have correct special_failsafe settings" + +log_assert $claim +log_onexit cleanup + +# Try different pool configurations +configs="mirror $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK0 $ZPOOL_DISK1 dedup mirror $CLASS_DISK0 $CLASS_DISK1" + +log_must disk_setup + +function do_test { + typeset config="$1" + typeset initial=$2 + typeset new=$3 + + log_must zpool create -o special_failsafe=$initial $TESTPOOL $config + totalwritten=0 + + # Sanity check that feature@special_failsafe aligns with the + # pool prop + if [ $initial == "on" ] ; then + feature_expected="active" + else + feature_expected="enabled" + fi + boilerplate_check "$feature_expected" "$initial" + + special_failsafe_make_datasets + write_some_files + + if [ $initial != "off" ] ; then + log_must zpool set special_failsafe=$new $TESTPOOL + fi + + write_some_files + + # Now add a new special/dedup disk to the special mirror + log_must zpool attach $TESTPOOL $CLASS_DISK0 $CLASS_DISK2 + write_some_files + + # Add another special & dedup disk in RAID0 with the existing + # special mirror + log_must zpool add $TESTPOOL special $CLASS_DISK3 + log_must zpool add $TESTPOOL dedup $CLASS_DISK4 + + write_some_files + verify_all_directories + + log_must zpool export $TESTPOOL + + alloc_class_disks="$(get_list_of_alloc_class_disks)" + zero_alloc_class_disks $alloc_class_disks + + log_must zpool import -l -d $IMPORTDIR $TESTPOOL + + verify_all_directories + + log_must zpool destroy $TESTPOOL +} + +# Create a pool that is initially not special_failsafe. Then, enable +# special_failsafe and add/attach a disk. 
+echo "$configs" | while read config ; do + for initial in "on" "off" ; do + for new in "on" "off" ; do + do_test "$config" $initial $new + done + done +done + +cleanup + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh new file mode 100755 index 000000000000..1905fba16073 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# DESCRIPTION: +# Verify 'zpool create' with different alloc class redundancy +# levels will correctly succeed or fail. + +verify_runnable "global" + +claim="zpool create with different special_failsafe and disk permutations work" + +log_assert $claim +log_onexit cleanup + +# These should always pass since they have same redundancy level +configs_pass="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3" + +# These should always pass with special_failsafe enabled or when '-f' is passed. +# They should fail otherwise. 
+configs_pass_failsafe="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup $CLASS_DISK0 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 dedup $CLASS_DISK2 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2" + +log_must disk_setup + +# Try configs with matching redundancy levels. They should all pass. +echo "$configs_pass" | while read config ; do + log_must zpool create -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL +done + +# Try configs with lower redundancy level. They should fail if special_failsafe +# is turned off and -f is not used. 
+echo "$configs_pass_failsafe" | while read config ; do + log_mustnot zpool create -o feature@special_failsafe=disabled $TESTPOOL $config + + log_must zpool create -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + + log_must zpool create -f -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_mustnot zpool create -o feature@special_failsafe=enabled -o special_failsafe=off $TESTPOOL $config +done + +cleanup + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh new file mode 100755 index 000000000000..808df272a4c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh @@ -0,0 +1,124 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Test multiple different special_failsafe permutations. After each step +# write a bunch of known files. 
Verify all files are present and correct +# after all the steps are complete. + +verify_runnable "global" + +claim="Files on special_failsafe enabled disks do not get corrupted" + +log_assert $claim +log_onexit cleanup + +# Try different pool configurations +configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3 +raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3 +$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1 +$ZPOOL_DISKS special $CLASS_DISK0 +$ZPOOL_DISKS dedup $CLASS_DISK0" + +echo "$configs" | while read config ; do + log_must disk_setup + log_must zpool create -o special_failsafe=on $TESTPOOL $config + totalwritten=0 + special_failsafe_make_datasets + + write_some_files + verify_all_directories + + alloc_class_disks="$(get_list_of_alloc_class_disks)" + log_must zpool export $TESTPOOL + + backup_alloc_class_disks $alloc_class_disks + zero_alloc_class_disks $alloc_class_disks + + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + # Our pool is imported but has all its special devices zeroed out. Try + # writing some files to it and export the pool + write_some_files + + log_must zpool export $TESTPOOL + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + write_some_files + + log_must zpool export $TESTPOOL + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + write_some_files + + # Make our old disks appear again (which have older data). Do a zpool + # clear to make them come back online and resilver. + restore_alloc_class_disks $alloc_class_disks + log_must zpool clear $TESTPOOL + + write_some_files + + # At this point the pool should be normal. The next test is to + # corrupt the alloc class devices while the pool is running. + zero_alloc_class_disks $alloc_class_disks + + # Trigger a scrub with our newly-zeroed alloc class disks + log_must zpool scrub $TESTPOOL + + # The pool should be degraded, but still alive. 
+ check_state $TESTPOOL "" "DEGRADED" + + write_some_files + + # Replace all the alloc class disks. This should get the pool + # back to normal. + for disk in $alloc_class_disks ; do + log_must zpool replace $TESTPOOL $disk + done + + write_some_files + + log_must zpool export $TESTPOOL + + # Backup special disks, then totally remove them. + backup_alloc_class_disks $alloc_class_disks + + rm -f $alloc_class_disks + + # Try to import with the alloc class disks missing - it should work. + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + # After all the pain we've put our pool through, it should still have all the + # correct file data. + log_must verify_all_directories + + if [[ "$totalwritten" != "840" ]] ; then + log_fail "Didn't see 840 files, saw $totalwritten" + fi + + # We've checked all the files. Do some more verifications. + verify_pool $TESTPOOL + verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR + + # Record a few stats that show metadata are in use + zpool get dedup $TESTPOOL + zdb -bb $TESTPOOL 2>&1 | grep -Ei 'normal|special|dedup|ddt' + + log_must zpool destroy $TESTPOOL + cleanup +done + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh new file mode 100755 index 000000000000..d8ba52c702b3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+#	Verify we can import a special_failsafe pool even if all its alloc class
+#	devices are missing.
+#
+verify_runnable "global"
+
+claim="Verify imports work on special_failsafe pools when vdevs missing"
+
+log_assert $claim
+log_onexit cleanup
+
+TWO_ZPOOL_DISKS="$ZPOOL_DISK0 $ZPOOL_DISK1"
+REPLACE_DISK="$ZPOOL_DISK2"
+
+# Try a bunch of different pool configurations (one configuration per line)
+configs="$TWO_ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $TWO_ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$TWO_ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$TWO_ZPOOL_DISKS special $CLASS_DISK0
+$TWO_ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+	typeset config="$1"
+	typeset action="$2"
+	typeset onoff="$3"
+
+	totalwritten=0
+	log_must disk_setup
+	log_must zpool create -o special_failsafe=$onoff $TESTPOOL $config
+
+	alloc_class_disks="$(get_list_of_alloc_class_disks)"
+
+	special_failsafe_make_datasets
+	write_some_files
+	verify_all_directories
+
+	log_must zpool export $TESTPOOL
+
+	# Back up the alloc class disks before removing or zeroing them
+	backup_alloc_class_disks $alloc_class_disks
+	if [[ "$action" == "remove" ]] ; then
+		rm -f $alloc_class_disks
+	else
+		zero_alloc_class_disks $alloc_class_disks
+	fi
+
+	# import should succeed or fail depending on how we're backed up
+	if [[ "$onoff" == "on" ]] ; then
+		log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+	else
+		log_mustnot zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+		# With the disks restored, we should be able to import
+		restore_alloc_class_disks $alloc_class_disks
+		log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+	fi
+	write_some_files
+
+	# Do a scrub and verify everything is correct
+	verify_pool $TESTPOOL
+
+	verify_all_directories
+
+	log_must zpool destroy $TESTPOOL
+
+	cleanup
+}
+
+echo "$configs" | while read -r config ; do
+	for action in "remove" "zero" ; do
+		for onoff in "off" "on" ; do
+			do_test "$config" "$action" "$onoff"
+		done
+	done
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh
new file mode 100755
index 000000000000..8f5722dfd8d0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh
@@ -0,0 +1,124 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+#	Verify we can offline special_failsafe alloc class disks.
+#	Verify we cannot offline non-special_failsafe alloc class disks.
+#
+verify_runnable "global"
+
+claim="Verify correct behavior when we offline an alloc class disk"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try a bunch of different pool configurations (one configuration per line)
+configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+	typeset prop="$1"
+	typeset config="$2"
+	log_must disk_setup
+	log_must zpool create -f $prop $TESTPOOL $config
+	check_pool_alloc_class_props
+
+	special_failsafe_make_datasets
+	totalwritten=0
+	write_some_files
+
+	alloc_class_disks=$(get_list_of_alloc_class_disks)
+	alloc_class_disks_arr=($alloc_class_disks)
+
+	if [[ "$prop" == "-o special_failsafe=on" ]] ; then
+		log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "active" ]
+	else
+		log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "enabled" ]
+	fi
+
+	for ((i = 0; i < ${#alloc_class_disks_arr[@]}; i++)); do
+		disk="${alloc_class_disks_arr[$i]}"
+		if [[ "$prop" == "-o special_failsafe=on" ]] ; then
+			# Everything is backed up.  We should be able to
+			# offline all the disks.
+			log_must zpool offline $TESTPOOL $disk
+			log_must check_state $TESTPOOL "$disk" "OFFLINE"
+			log_must check_state $TESTPOOL "" "DEGRADED"
+		else
+			PARENT=$(get_vdev_prop parent $TESTPOOL $disk)
+			if [[ "$PARENT" == "$TESTPOOL" ]] ; then
+				# Leaf is a top-level vdev, offline should fail
+				log_mustnot zpool offline $TESTPOOL $disk
+				log_must check_state $TESTPOOL "$disk" "ONLINE"
+				log_must check_state $TESTPOOL "" "ONLINE"
+			else
+				# We're part of a mirror.  We know all
+				# mirrors in our test pool are two-disk,
+				# so we should be able to offline the
+				# first disk, but not the second.
+				if [[ "$i" == "0" ]] ; then
+					# First alloc class disk - pretend
+					# "previous" disk was online to
+					# make things easy.
+					prev_online=1
+				else
+					if check_state $TESTPOOL "${alloc_class_disks_arr[$i - 1]}" "ONLINE" ; then
+						prev_online=1
+					else
+						prev_online=0
+					fi
+				fi
+
+				if [[ "$prev_online" == "1" ]] ; then
+					# First disk in mirror, can offline
+					log_must zpool offline $TESTPOOL $disk
+					log_must check_state $TESTPOOL "$disk" "OFFLINE"
+					log_must check_state $TESTPOOL "" "DEGRADED"
+				else
+					# Second disk in mirror, can't offline
+					# but we should still be in a pool
+					# degraded state from the first disk
+					# going offline.
+					log_mustnot zpool offline $TESTPOOL $disk
+					log_must check_state $TESTPOOL "$disk" "ONLINE"
+					log_must check_state $TESTPOOL "" "DEGRADED"
+				fi
+			fi
+		fi
+	done
+
+	write_some_files
+	verify_all_directories
+
+	# We've checked all the files.  Do some more verifications.
+	verify_pool $TESTPOOL
+	verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR
+
+	zpool clear $TESTPOOL
+	log_must zpool destroy $TESTPOOL
+	cleanup
+}
+
+for prop in "-o special_failsafe=on" "" ; do
+	echo "$configs" | while read -r config ; do
+		do_test "$prop" "$config"
+	done
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh
new file mode 100755
index 000000000000..3cf94e0dc11f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh
@@ -0,0 +1,133 @@
+#!/bin/ksh -p
+
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+#	Verify that special_failsafe prop does not work if
+#	SPA_FEATURE_SPECIAL_FAILSAFE is disabled.  Also, test upgrades.
+
+verify_runnable "global"
+
+claim="special_failsafe prop shouldn't work without SPA_FEATURE_SPECIAL_FAILSAFE"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try a bunch of different pool configurations (one configuration per line)
+configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+# Make the pool disks smaller to make them quicker to back up.  We don't use
+# much data on them.
+export ZPOOL_DEVSIZE=200M
+export CLASS_DEVSIZE=200M
+
+log_must disk_setup
+
+echo "$configs" | while read -r config ; do
+	# We should not be able to set special_failsafe=on if the feature
+	# flag is disabled.
+	log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=on $TESTPOOL $config
+
+	# Try a few permutations that should succeed
+	log_must zpool create -o special_failsafe=off $TESTPOOL $config
+	boilerplate_check "enabled" "off"
+	log_must zpool destroy $TESTPOOL
+
+	log_must zpool create -o special_failsafe=on $TESTPOOL $config
+	boilerplate_check "active" "on"
+	log_must zpool destroy $TESTPOOL
+
+	log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config
+	boilerplate_check "active" "on"
+	log_must zpool destroy $TESTPOOL
+done
+
+# Now let's do a multi-step test where we upgrade an older pool
+for cmd in "zpool set feature@special_failsafe=enabled $TESTPOOL" "zpool upgrade $TESTPOOL" ; do
+
+	# Make a pool with no special devices
+	log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL mirror $ZPOOL_DISKS
+	totalwritten=0
+
+	boilerplate_check "disabled" "off"
+	special_failsafe_make_datasets
+	write_some_files
+
+	# Test enabling the feature in two different ways:
+	#
+	#     zpool set feature@special_failsafe=enabled ...
+	#     zpool upgrade ...
+	#
+	log_must eval "$cmd"
+	boilerplate_check "enabled" "off"
+	write_some_files
+
+	# Shouldn't be able to add with special_failsafe prop off
+	log_mustnot zpool add $TESTPOOL special $CLASS_DISK0
+
+	log_must zpool set special_failsafe=on $TESTPOOL
+	boilerplate_check "enabled" "on"
+	write_some_files
+
+	log_must zpool add $TESTPOOL special $CLASS_DISK0
+
+	boilerplate_check "active" "on"
+
+	write_some_files
+
+	log_must zpool add $TESTPOOL dedup $CLASS_DISK1
+
+	write_some_files
+
+	log_must zpool export $TESTPOOL
+	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+	verify_all_directories
+
+	# You should be able to turn special_failsafe off if it was on
+	log_must zpool set special_failsafe=off $TESTPOOL
+
+	boilerplate_check "active" "off"
+
+	# If special_failsafe prop was on and the feature active, and then you
+	# turned the prop off, you cannot turn it back on again.
+	log_mustnot zpool set special_failsafe=on $TESTPOOL
+
+	log_must zpool destroy $TESTPOOL
+done
+
+# Verify the special_failsafe prop persists across imports
+log_must zpool create -o special_failsafe=on $TESTPOOL $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1
+log_must zpool export $TESTPOOL
+log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+typeset prop=$(get_pool_prop special_failsafe $TESTPOOL)
+log_must [ "$prop" == "on" ]
+log_must zpool destroy $TESTPOOL
+
+log_must zpool create $TESTPOOL $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1
+log_must zpool export $TESTPOOL
+log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+prop=$(get_pool_prop special_failsafe $TESTPOOL)
+log_must [ "$prop" == "off" ]
+log_must zpool destroy $TESTPOOL
+
+cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
new file mode 100755
index 000000000000..7ccb32b7bf82
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
@@
-0,0 +1,106 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+#	Destroy alloc class disks and then do a scrub on both a
+#	special_failsafe and non-special_failsafe pool.  The special_failsafe
+#	pool should only be DEGRADED, while the non-special_failsafe pool should
+#	be SUSPENDED.
+
+verify_runnable "global"
+
+claim="special_failsafe pools survive a normally fatal scrub with bad disks"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try different pool configurations (one configuration per line)
+configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+	typeset config="$1"
+	typeset action="$2"
+	typeset onoff="$3"
+	totalwritten=0
+
+	log_must disk_setup
+	log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=$onoff $TESTPOOL $config
+
+	special_failsafe_make_datasets
+
+	totalwritten=0
+	write_some_files
+
+	# When we do a scrub later, we will either want it to suspend or not
+	# suspend the pool, depending on our backup settings.  Make sure we are
+	# able to ride through the suspended pool so we can continue with our
+	# tests.
+	log_must zpool set failmode=continue $TESTPOOL
+
+	alloc_class_disks="$(get_list_of_alloc_class_disks)"
+	backup_alloc_class_disks $alloc_class_disks
+	zero_alloc_class_disks $alloc_class_disks
+
+	# Spawn scrub into the background since the pool may be suspended and
+	# it will hang.  We need to continue past the hung scrub so we
+	# can restore the bad disks and do a 'zpool clear' to remove the
+	# suspended pool.
+	zpool scrub $TESTPOOL &
+
+	wait_scrubbed $TESTPOOL 3
+	if [[ "$onoff" == "on" ]] ; then
+		log_must check_state $TESTPOOL "" "DEGRADED"
+
+		verify_pool $TESTPOOL
+
+		write_some_files
+		verify_all_directories
+	else
+		log_must check_state $TESTPOOL "" "SUSPENDED"
+
+		# Pool should be suspended.  Restore the old disks so we can
+		# clear the suspension.  'zpool clear' here will delete the
+		# pool.
+		restore_alloc_class_disks $alloc_class_disks
+		log_must zpool clear $TESTPOOL
+	fi
+
+	cleanup
+}
+
+# Stop zed in case we left it running from an old, aborted, test run.
+zed_stop
+zed_cleanup
+
+log_must zed_setup
+log_must zed_start
+log_must zed_events_drain
+
+# Verify scrubs work as expected with different permutations of special_failsafe
+echo "$configs" | while read -r config ; do
+	for i in "on" "off" ; do
+		do_test "$config" "zero" "$i"
+	done
+done
+
+log_must zed_stop
+log_must zed_cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh
new file mode 100755
index 000000000000..79a3008740fc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh
@@ -0,0 +1,94 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+#	Verify we can split a pool with special_failsafe, and the new pool
+#	keeps the special_failsafe settings.  Also verify the new pool has
+#	all the data if the pool has special_failsafe.
+#
+verify_runnable "global"
+
+claim="zpool split works with special_failsafe"
+
+log_assert $claim
+log_onexit cleanup
+
+# Create a normal, special_failsafe pool
+log_must disk_setup
+log_must zpool create -o special_failsafe=on $TESTPOOL mirror \
+    $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
+    mirror $CLASS_DISK2 $CLASS_DISK3
+
+totalwritten=0
+special_failsafe_make_datasets
+write_some_files
+verify_all_directories
+
+# Split the pool and verify the old pool has all the data
+newpool="${TESTPOOL}-2"
+
+log_must zpool split $TESTPOOL $newpool
+check_pool_alloc_class_props
+verify_all_directories
+
+# Force-fault alloc class devices on the old pool and verify we have all the
+# data.
+log_must zpool offline -f $TESTPOOL $CLASS_DISK0
+log_must zpool offline -f $TESTPOOL $CLASS_DISK2
+log_must check_state $TESTPOOL $CLASS_DISK0 "FAULTED"
+log_must check_state $TESTPOOL $CLASS_DISK2 "FAULTED"
+
+log_must check_state $TESTPOOL "" "DEGRADED"
+verify_all_directories
+
+log_must zpool clear $TESTPOOL
+
+# All done with the old pool
+log_must zpool destroy $TESTPOOL
+
+# Import the new split pool and rename it $TESTPOOL since all our verification
+# functions expect the pool to be called $TESTPOOL.
+log_must zpool import -l -f -d "$IMPORTDIR" $newpool $TESTPOOL
+
+check_pool_alloc_class_props
+verify_all_directories
+
+# Zero the alloc class devices of the split pool and verify we have all the
+# data.
+log_must zpool export $TESTPOOL
+
+zero_file $CLASS_DISK1
+zero_file $CLASS_DISK3
+
+log_must zpool import -l -f -d "$IMPORTDIR" $TESTPOOL
+
+verify_all_directories
+log_must zpool destroy $TESTPOOL
+
+# Create a non-special_failsafe pool, split it, and verify the split pool is
+# also not special_failsafe.
+log_must zpool create -o special_failsafe=off $TESTPOOL mirror \
+    $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
+    mirror $CLASS_DISK2 $CLASS_DISK3
+
+log_must zpool split $TESTPOOL $newpool
+check_pool_alloc_class_props
+log_must zpool destroy $TESTPOOL
+log_must zpool import -l -f -d "$IMPORTDIR" $newpool $TESTPOOL
+check_pool_alloc_class_props
+log_must zpool destroy $TESTPOOL
+
+log_pass $claim