diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 57170c8ae717..dc71dc5e7912 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -1164,6 +1164,23 @@ zpool_do_add(int argc, char **argv)
}
}
+ /*
+ * Special case:
+ *
+	 * We need to know the special_failsafe pool property value to
+	 * determine whether the new vdev configuration meets the redundancy
+	 * requirements for special and dedup vdevs.
+	 *
+	 * Pass the current value of special_failsafe in to the proplist.
+ */
+ char strval[ZFS_MAXPROPLEN];
+ if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
+ ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
+ verify(add_prop_list(
+ zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
+ &props, B_TRUE) == 0);
+ }
+
/* pass off to make_root_vdev for processing */
nvroot = make_root_vdev(zhp, props, !check_inuse,
check_replication, B_FALSE, dryrun, argc, argv);
@@ -6940,6 +6957,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
}
}
+ /*
+ * Special case:
+ *
+	 * We need to know the special_failsafe pool property value to
+	 * determine whether the new vdev configuration meets the redundancy
+	 * requirements for special and dedup vdevs.
+	 *
+	 * Pass the current value of special_failsafe in to the proplist.
+ */
+ char strval[ZFS_MAXPROPLEN];
+ if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
+ ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
+ verify(add_prop_list(
+ zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
+ &props, B_TRUE) == 0);
+ }
+
nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
argc, argv);
if (nvroot == NULL) {
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index fbd4b81dfacc..cd2996953084 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -85,6 +85,7 @@
*/
boolean_t error_seen;
boolean_t is_force;
+boolean_t is_alloc_class;
void
vdev_error(const char *fmt, ...)
@@ -94,8 +95,17 @@ vdev_error(const char *fmt, ...)
	if (!error_seen) {
		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
-		if (!is_force)
-			(void) fprintf(stderr, gettext("use '-f' to override "
-			    "the following errors:\n"));
-		else
+		if (!is_force) {
+			if (is_alloc_class) {
+				(void) fprintf(stderr, gettext("Turn on the "
+				    "special_failsafe pool property or use "
+				    "'-f' to override the following "
+				    "errors:\n"));
+				is_alloc_class = B_FALSE;
+			} else {
+				(void) fprintf(stderr, gettext("use '-f' to "
+				    "override the following errors:\n"));
+			}
+		} else {
			(void) fprintf(stderr, gettext("the following errors "
			    "must be manually repaired:\n"));
+		}
@@ -442,6 +450,7 @@ typedef struct replication_level {
const char *zprl_type;
uint64_t zprl_children;
uint64_t zprl_parity;
+ boolean_t zprl_is_alloc_class;
} replication_level_t;
#define ZPOOL_FUZZ (16 * 1024 * 1024)
@@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b)
return (B_FALSE);
}
+/*
+ * Return true if 'props' contains:
+ *
+ * special_failsafe=on
+ *
+ * ... and feature@special_failsafe is NOT disabled.
+ */
+static boolean_t
+is_special_failsafe_enabled_in_props(nvlist_t *props)
+{
+ const char *str = NULL;
+
+ if (nvlist_lookup_string(props, "feature@special_failsafe",
+ &str) == 0) {
+ if ((str != NULL) && strcmp(str, "disabled") == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE),
+ &str) == 0) {
+ if ((str != NULL) && strcmp(str, "on") == 0) {
+ return (B_TRUE); /* It is enabled */
+ }
+ }
+
+ return (B_FALSE);
+}
+
/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev.
*/
static replication_level_t *
-get_replication(nvlist_t *nvroot, boolean_t fatal)
+get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal)
{
nvlist_t **top;
uint_t t, toplevels;
@@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
nvlist_t *nv;
const char *type;
replication_level_t lastrep = {0};
- replication_level_t rep;
+ replication_level_t rep = {0};
replication_level_t *ret;
replication_level_t *raidz, *mirror;
boolean_t dontreport;
@@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
for (t = 0; t < toplevels; t++) {
uint64_t is_log = B_FALSE;
+ const char *str = NULL;
nv = top[t];
@@ -528,12 +568,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
strcmp(type, VDEV_TYPE_INDIRECT) == 0)
continue;
+		rep.zprl_type = type;
+		rep.zprl_is_alloc_class = B_FALSE;
+
+		/*
+		 * If special_failsafe=on then we know the special allocation
+		 * class devices have at least one copy of their data in the
+		 * main pool, so we can ignore their replication level.
+		 */
+		(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+		    &str);
+		if (str != NULL &&
+		    (strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+		    strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0)) {
+			rep.zprl_is_alloc_class = B_TRUE;
+			if (is_special_failsafe_enabled_in_props(props))
+				continue; /* Backed up; skip redundancy */
+		}
+
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) {
/*
* This is a 'file' or 'disk' vdev.
*/
- rep.zprl_type = type;
rep.zprl_children = 1;
rep.zprl_parity = 0;
} else {
@@ -548,7 +605,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* We also check that the size of each vdev (if it can
* be determined) is the same.
*/
- rep.zprl_type = type;
rep.zprl_children = 0;
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
@@ -808,7 +864,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* report any difference between the two.
*/
static int
-check_replication(nvlist_t *config, nvlist_t *newroot)
+check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot)
{
nvlist_t **child;
uint_t children;
@@ -825,7 +881,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- if ((current = get_replication(nvroot, B_FALSE)) == NULL)
+ if ((current = get_replication(props, nvroot, B_FALSE)) == NULL)
return (0);
}
/*
@@ -850,17 +906,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
* Get the replication level of the new vdev spec, reporting any
* inconsistencies found.
*/
- if ((new = get_replication(newroot, B_TRUE)) == NULL) {
+ if ((new = get_replication(props, newroot, B_TRUE)) == NULL) {
free(current);
return (-1);
}
-
/*
* Check to see if the new vdev spec matches the replication level of
* the current pool.
*/
ret = 0;
if (current != NULL) {
+		is_alloc_class = (current->zprl_is_alloc_class ||
+		    new->zprl_is_alloc_class);
+
+ /*
+ * Special case:
+ * If there were any redundancy problems with alloc class vdevs
+ * BUT the pool had special_failsafe on, then we're fine since
+ * all the alloc class data has a copy in the main pool.
+ */
+ if (is_special_failsafe_enabled_in_props(props) &&
+ is_alloc_class)
+ goto out;
+
if (is_raidz_mirror(current, new, &raidz, &mirror) ||
is_raidz_mirror(new, current, &raidz, &mirror)) {
if (raidz->zprl_parity != mirror->zprl_children - 1) {
@@ -899,7 +969,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
ret = -1;
}
}
-
+out:
free(new);
if (current != NULL)
free(current);
@@ -1888,7 +1958,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
* found. We include the existing pool spec, if any, as we need to
* catch changes against the existing replication level.
*/
- if (check_rep && check_replication(poolconfig, newroot) != 0) {
+ if (check_rep && check_replication(props, poolconfig, newroot) != 0) {
nvlist_free(newroot);
return (NULL);
}
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index e191420f2d2d..d83d5defa7ee 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -258,6 +258,7 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO,
+ ZPOOL_PROP_SPECIAL_FAILSAFE,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -1610,6 +1611,7 @@ typedef enum {
ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
+ ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE,
} zfs_errno_t;
/*
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 3073c4d1b937..8d02dc8d5dac 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1117,7 +1117,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
-extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing,
+    uint64_t missing_recovered);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 5605a35b8641..e5e61baeee10 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -336,6 +336,13 @@ struct spa {
uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
+ /*
+	 * Number of 'spa_missing_tvds' that are alloc class devices in a
+	 * pool with special_failsafe on, and are thus recoverable from the
+	 * copies in the main pool.
+ */
+ uint64_t spa_missing_recovered_tvds;
+
uint64_t spa_nonallocating_dspace;
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;
@@ -474,6 +481,9 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */
+
+ /* Backup special/dedup devices data to the pool */
+ boolean_t spa_special_failsafe;
};
extern char *spa_config_path;
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 57ff31e89eb9..47fc643d9c53 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -640,6 +640,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd);
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
+extern boolean_t vdev_is_leaf(vdev_t *vd);
+extern boolean_t vdev_is_special(vdev_t *vd);
+extern boolean_t vdev_is_dedup(vdev_t *vd);
+extern boolean_t vdev_is_alloc_class(vdev_t *vd);
+extern boolean_t vdev_is_special_failsafe(vdev_t *vd);
/*
* Vdev ashift optimization tunables
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 2515ba321759..be74255b31c6 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
+ SPA_FEATURE_SPECIAL_FAILSAFE,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 80f4b7439a55..10ec8c7eda12 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -607,7 +607,7 @@
-
+
@@ -2921,7 +2921,8 @@
-
+
+
@@ -5963,7 +5964,8 @@
-
+
+
@@ -9025,8 +9027,8 @@
-
-
+
+
@@ -9103,7 +9105,7 @@
-
+
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 73ae0950ccb6..378de5a6f8ee 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -774,6 +774,15 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break;
+ case ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "Cannot set pool prop special_failsafe=on since "
+ "feature@special_failsafe is not set to 'enabled'.\n"
+ "This could be because the special_failsafe pool prop was "
+ "manually turned off while the special_failsafe feature "
+ "flag was active, or the feature flag was disabled."));
+ zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
+ break;
default:
zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c
index 06705ff4d9b4..6e349920d21f 100644
--- a/lib/libzutil/zutil_import.c
+++ b/lib/libzutil/zutil_import.c
@@ -1924,7 +1924,7 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,
/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
static boolean_t
-vdev_is_leaf(nvlist_t *nv)
+vdev_is_leaf_nv(nvlist_t *nv)
{
uint_t children = 0;
nvlist_t **child;
@@ -1937,10 +1937,10 @@ vdev_is_leaf(nvlist_t *nv)
/* Return if a vdev is a leaf vdev and a real device (disk or file) */
static boolean_t
-vdev_is_real_leaf(nvlist_t *nv)
+vdev_is_real_leaf_nv(nvlist_t *nv)
{
const char *type = NULL;
- if (!vdev_is_leaf(nv))
+ if (!vdev_is_leaf_nv(nv))
return (B_FALSE);
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);
@@ -1973,7 +1973,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
/* The very first entry in the NV list is a special case */
if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
- if (real_leaves_only && !vdev_is_real_leaf(nv))
+ if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0);
*((nvlist_t **)last_nv) = nv;
@@ -1996,7 +1996,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
* we want.
*/
if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
- if (real_leaves_only && !vdev_is_real_leaf(nv))
+ if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0);
*((nvlist_t **)last_nv) = nv;
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index ea3c68dc6083..9316f7983336 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -322,6 +322,40 @@ With device removal, it can be returned to the
.Sy enabled
state if all the dedicated allocation class vdevs are removed.
.
+.feature org.openzfs special_failsafe yes allocation_classes
+This feature allows the
+.Sy special_failsafe
+pool property to be used.
+When the
+.Sy special_failsafe
+pool property is set to
+.Sy on ,
+all subsequent writes to allocation class vdevs (such as special and dedup
+vdevs) also generate an additional copy of the data, written to the main
+pool.
+This allows alloc class vdev data to be "backed up" to the pool.
+A fully backed up allocation class vdev can fail without causing the pool to
+be suspended, even if the alloc class device is not redundant.
+.Pp
+It is important to note the difference between the
+.Sy special_failsafe
+feature flag and a
+.Sy special_failsafe
+pool property since they appear similar.
+The
+.Sy special_failsafe
+feature flag is a safeguard that prevents a pool using special_failsafe from
+being imported read/write on an older version of ZFS that does not support
+special_failsafe (which could compromise the integrity of the backup
+guarantees).
+The pool property is what actually turns the backup copy writes on and off.
+The
+.Sy special_failsafe
+feature will switch from
+.Sy enabled
+to
+.Sy active
+when allocation class devices are added.
+See the
+.Sy special_failsafe
+pool property for more details.
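+.Pp
+For example, the feature's state can be queried with:
+.Bd -literal -compact
+# zpool get feature@special_failsafe pool
+.Ed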
+.
.feature com.delphix async_destroy yes
Destroying a file system requires traversing all of its data in order to
return its used space to the pool.
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7
index 18dfca6dc8ac..5e6b2c0e0db4 100644
--- a/man/man7/zpoolconcepts.7
+++ b/man/man7/zpoolconcepts.7
@@ -181,14 +181,18 @@ section.
.It Sy dedup
A device solely dedicated for deduplication tables.
The redundancy of this device should match the redundancy of the other normal
-devices in the pool.
+devices in the pool, unless the
+.Sy special_failsafe
+pool property is enabled.
If more than one dedup device is specified, then
allocations are load-balanced between those devices.
.It Sy special
A device dedicated solely for allocating various kinds of internal metadata,
and optionally small file blocks.
The redundancy of this device should match the redundancy of the other normal
-devices in the pool.
+devices in the pool, unless the
+.Sy special_failsafe
+pool property is enabled.
If more than one special device is specified, then
allocations are load-balanced between those devices.
.Pp
diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7
index 5428ab8d3076..129f8de52731 100644
--- a/man/man7/zpoolprops.7
+++ b/man/man7/zpoolprops.7
@@ -437,6 +437,34 @@ command, though this property can be used when a specific version is needed for
backwards compatibility.
Once feature flags are enabled on a pool this property will no longer have a
value.
+.It Sy special_failsafe Ns = Ns Sy on Ns | Ns Sy off
+Controls the special failsafe subsystem for special allocation class vdevs.
+When it is turned on, all writes to special allocation class vdevs (such as
+.Sy special
+and
+.Sy dedup
+vdevs) also write an additional copy of the data to the main pool.
+This allows alloc class vdev data to be "backed up" to the pool.
+When
+.Sy special_failsafe
+is turned on, alloc class vdevs can fail regardless of their redundancy level
+without the pool losing data.
+To use
+.Sy special_failsafe
+simply turn it on at pool creation time, or turn it on prior to adding
+alloc class devices.
+Note that after alloc class vdevs are added to the pool with
+.Sy special_failsafe
+on, you can still turn
+.Sy special_failsafe
+off again, but once it is off it cannot be turned back on.
+.Sy special_failsafe
+can be freely toggled on and off while the pool contains no alloc class
+devices, since the property has no effect until such devices are added.
+The
+.Sy feature@special_failsafe
+feature flag must be enabled in order to use the
+.Sy special_failsafe
+pool property.
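+.Pp
+For example, with hypothetical device names:
+.Bd -literal -compact
+# zpool create -o special_failsafe=on tank raidz sda sdb sdc special sdd
+# zpool set special_failsafe=off tank
+.Ed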
.El
.
.Ss User Properties
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 309d9bf14cd4..a3583faa8195 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -753,6 +753,18 @@ zpool_feature_init(void)
"org.openzfs:raidz_expansion", "raidz_expansion",
"Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
+ {
+ static const spa_feature_t special_failsafe_deps[] = {
+ SPA_FEATURE_ALLOCATION_CLASSES,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SPECIAL_FAILSAFE,
+ "org.openzfs:special_failsafe", "special_failsafe",
+ "Save a copy of allocation class device data to main pool",
+ ZFEATURE_FLAG_MOS,
+ ZFEATURE_TYPE_BOOLEAN, special_failsafe_deps,
+ sfeatures);
+ }
zfs_mod_list_supported_free(sfeatures);
}
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index e2e3bf5be69e..e767c0e3193e 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -153,6 +153,10 @@ zpool_prop_init(void)
zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
boolean_table, sfeatures);
+ zprop_register_index(ZPOOL_PROP_SPECIAL_FAILSAFE,
+ "special_failsafe", 0, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "on | off", "SPECIAL_FAILSAFE", boolean_table,
+ sfeatures);
/* default index properties */
zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 7170b5eefcea..fa73f6c5da4f 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5848,10 +5848,22 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
dva_t *dva = bp->blk_dva;
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
+ boolean_t is_special_failsafe = B_FALSE;
+
+	if (spa->spa_special_failsafe && (mc == spa_special_class(spa) ||
+	    mc == spa_dedup_class(spa))) {
+		is_special_failsafe = B_TRUE;
+	}
ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+ /*
+	 * Earlier layers of the code should have set ndvas > 1 if the
+ * alloc class vdev is being backed up.
+ */
+ ASSERT(!(is_special_failsafe && ndvas == 1));
+
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
if (mc->mc_allocator[allocator].mca_rotor == NULL) {
@@ -5866,7 +5878,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
ASSERT3P(zal, !=, NULL);
for (int d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ metaslab_class_t *_mc;
+ if (is_special_failsafe && (d == 1)) {
+ /*
+ * If we have the special_failsafe prop set, then make
+ * the 2nd copy of the data we are going to write go to
+ * the regular pool rather than yet another copy to the
+ * alloc class device. That way, if the special device
+ * is lost, there's still a backup in the pool.
+ */
+ _mc = spa_normal_class(spa);
+ } else {
+ _mc = mc;
+ }
+
+ error = metaslab_alloc_dva(spa, _mc, psize, dva, d, hintdva,
txg, flags, zal, allocator);
if (error != 0) {
for (d--; d >= 0; d--) {
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 638572996c3a..cb574eefae7b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -477,6 +477,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
DNODE_MIN_SIZE, ZPROP_SRC_NONE);
}
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) {
+ zprop_source_t src;
+ if ((uint64_t)spa->spa_special_failsafe ==
+ zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE,
+ NULL, spa->spa_special_failsafe, src);
+ } else {
+ /* special_failsafe not used */
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE,
+ NULL, B_FALSE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -610,6 +626,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
int error = 0, reset_bootfs = 0;
uint64_t objnum = 0;
boolean_t has_feature = B_FALSE;
+ boolean_t special_failsafe_prop = B_FALSE;
+
+ /*
+	 * The way the feature flags work here is a little subtle.
+ *
+ * At zpool creation time, this feature will not be initialized yet when
+ * spa_prop_validate() gets called. This works out though, as the
+ * feature flag will be passed in the nvlist if the feature is enabled.
+ *
+ * After the pool is created, calls to this function (like zpool set)
+ * will not include the feature flag in the props nvlist, but the
+ * feature table will be initialized, so we can use
+ * spa_feature_is_active().
+ */
+ boolean_t special_failsafe_feature_disabled;
+ special_failsafe_feature_disabled = !(spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPECIAL_FAILSAFE) || spa_feature_is_active(spa,
+ SPA_FEATURE_SPECIAL_FAILSAFE));
+
+	/* Did they explicitly pass feature@special_failsafe=enabled? */
+ boolean_t special_failsafe_feature_passed = B_FALSE;
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
@@ -617,6 +654,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
const char *strval, *slash, *check, *fname;
const char *propname = nvpair_name(elem);
zpool_prop_t prop = zpool_name_to_prop(propname);
+ spa_feature_t fid = 0;
switch (prop) {
case ZPOOL_PROP_INVAL:
@@ -651,11 +689,30 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
}
fname = strchr(propname, '@') + 1;
- if (zfeature_lookup_name(fname, NULL) != 0) {
+ if (zfeature_lookup_name(fname, &fid) != 0) {
error = SET_ERROR(EINVAL);
break;
}
-
+ /*
+ * Special case - If both:
+ *
+ * SPA_FEATURE_SPECIAL_FAILSAFE = disabled
+ *
+ * ... and ...
+ *
+ * ZPOOL_PROP_SPECIAL_FAILSAFE = on
+ *
+ * then we need to fail. Note that the presence
+ * of SPA_FEATURE_SPECIAL_FAILSAFE in the
+ * nvlist means it is enabled (although its
+ * intval will be 0). If it's disabled, then
+ * SPA_FEATURE_SPECIAL_FAILSAFE will not
+ * be in the nvlist at all.
+ */
+ if (fid == SPA_FEATURE_SPECIAL_FAILSAFE) {
+ special_failsafe_feature_passed =
+ B_TRUE;
+ }
has_feature = B_TRUE;
} else {
error = SET_ERROR(EINVAL);
@@ -799,6 +856,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (strlen(strval) > ZPROP_MAX_COMMENT)
error = SET_ERROR(E2BIG);
break;
+ case ZPOOL_PROP_SPECIAL_FAILSAFE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+			if (!error && intval == 1)
+ special_failsafe_prop = B_TRUE;
+ break;
default:
break;
@@ -811,6 +875,26 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
(void) nvlist_remove_all(props,
zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
+ if (special_failsafe_prop && special_failsafe_feature_disabled &&
+ !special_failsafe_feature_passed) {
+ /*
+ * We can't enable SPECIAL_FAILSAFE pool prop if the
+ * feature flag SPA_FEATURE_SPECIAL_FAILSAFE is
+ * disabled.
+ */
+ error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE);
+ }
+
+ /*
+ * If the user wants to turn on the special_failsafe prop, but it
+ * was turned off (while the feature was active), then it can't be
+ * turned on again.
+ */
+ if (spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE) &&
+ !spa->spa_special_failsafe && special_failsafe_prop) {
+ error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE);
+ }
+
if (!error && reset_bootfs) {
error = nvlist_remove(props,
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
@@ -2475,6 +2559,53 @@ spa_check_removed(vdev_t *vd)
}
}
+/*
+ * Decide what to do if we have missing/corrupted alloc class devices.
+ *
+ * If we have missing top-level vdevs and they are all alloc class devices with
+ * special_failsafe set, then we may still be able to import the pool.
+ */
+static int
+spa_check_for_bad_alloc_class_devices(spa_t *spa)
+{
+ if (spa->spa_missing_recovered_tvds == 0)
+ return (0);
+
+ /*
+ * Are there missing alloc class devices but
+ * SPA_FEATURE_SPECIAL_FAILSAFE is not enabled? If so,
+ * then we can't import.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) {
+ spa_load_note(spa, "some alloc class devices are missing, "
+ "cannot import.");
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * If all the missing top-level devices are alloc class devices, and
+ * if they have all their data backed up to the pool, then we can still
+ * import the pool.
+ */
+ if (spa->spa_missing_tvds > 0 &&
+ spa->spa_missing_tvds == spa->spa_missing_recovered_tvds) {
+ spa_load_note(spa, "only alloc class devices are missing, and "
+ "the normal pool has copies of the alloc class data, so "
+ "it's still possible to import.");
+ return (0);
+ }
+
+ /*
+ * If we're here, then it means that not all the missing top-level vdevs
+ * were alloc class devices. This should have been caught earlier.
+ */
+	spa_load_note(spa, "some alloc class devices that do not have a "
+	    "special_failsafe backup copy are amongst those that are "
+	    "missing, cannot import");
+
+ return (SET_ERROR(ENXIO));
+}
+
static int
spa_check_for_missing_logs(spa_t *spa)
{
@@ -3966,7 +4097,24 @@ spa_ld_open_vdevs(spa_t *spa)
error = vdev_open(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
- if (spa->spa_missing_tvds != 0) {
+ if (spa->spa_missing_tvds != 0 &&
+ spa->spa_missing_tvds == spa->spa_missing_recovered_tvds &&
+ (error == 0 || error == ENOENT)) {
+ /*
+		 * Special case: If all the missing top-level vdevs are alloc
+		 * class devices, we may or may not be able to import the
+		 * pool, depending on whether the relevant special_failsafe
+		 * feature and property are set. At this early stage of import
+		 * we do not have the feature flags loaded yet, so for now
+		 * proceed with the import. We will do the backup checks
+		 * later, after the feature flags are loaded.
+		 */
+		spa_load_note(spa, "vdev tree has %lld missing alloc class "
+		    "top-level vdevs. Keep importing for now until we "
+		    "can check the feature flags.",
+		    (u_longlong_t)spa->spa_missing_tvds);
+ error = 0;
+ } else if (spa->spa_missing_tvds != 0) {
spa_load_note(spa, "vdev tree has %lld missing top-level "
"vdevs.", (u_longlong_t)spa->spa_missing_tvds);
if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
@@ -4726,6 +4874,7 @@ spa_ld_get_props(spa_t *spa)
if (error == 0) {
uint64_t autoreplace = 0;
+ uint64_t special_failsafe = 0;
spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
@@ -4734,7 +4883,11 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
+ spa_prop_find(spa, ZPOOL_PROP_SPECIAL_FAILSAFE,
+ &special_failsafe);
+
spa->spa_autoreplace = (autoreplace != 0);
+ spa->spa_special_failsafe = (special_failsafe != 0);
}
/*
@@ -5398,6 +5551,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
if (error != 0)
goto fail;
+	spa_import_progress_set_notes(spa, "Checking for bad alloc class "
+	    "devices");
+	error = spa_check_for_bad_alloc_class_devices(spa);
+	if (error != 0)
+		goto fail;
+
spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa);
if (error != 0)
@@ -6589,6 +6749,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
+ /*
+ * Set initial special_failsafe settings. These may change after the
+ * nvlist properties are processed a little later in spa_sync_props().
+ */
+ spa->spa_special_failsafe = (boolean_t)
+ zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE);
+
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(props, tx);
@@ -9487,6 +9654,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
const char *elemname = nvpair_name(elem);
zprop_type_t proptype;
spa_feature_t fid;
+ boolean_t boolval;
switch (prop = zpool_name_to_prop(elemname)) {
case ZPOOL_PROP_VERSION:
@@ -9550,6 +9718,21 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
"%s=%s", nvpair_name(elem), strval);
break;
+ case ZPOOL_PROP_SPECIAL_FAILSAFE:
+ boolval = (boolean_t)fnvpair_value_uint64(elem);
+ spa->spa_special_failsafe = boolval;
+ /*
+ * Dirty the configuration on vdevs as above.
+ */
+ if (tx->tx_txg != TXG_INITIAL) {
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), boolval ? "on" : "off");
+ break;
+
case ZPOOL_PROP_INVAL:
if (zpool_prop_feature(elemname)) {
fname = strchr(elemname, '@') + 1;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index d1d41bbe7214..74ef44a9edd9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -738,6 +738,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
+
+ spa->spa_special_failsafe = B_TRUE;
+
spa_set_deadman_failmode(spa, zfs_deadman_failmode);
spa_set_allocator(spa, zfs_active_allocator);
@@ -1682,6 +1685,9 @@ spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
*/
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+
+ if (spa->spa_special_failsafe)
+ spa_feature_incr(spa, SPA_FEATURE_SPECIAL_FAILSAFE, tx);
}
/*
@@ -2850,10 +2856,21 @@ spa_syncing_log_sm(spa_t *spa)
return (spa->spa_syncing_log_sm);
}
+/*
+ * Record the total number of missing top-level vdevs ('missing'), and the
+ * number of missing top-level vdevs that are recoverable ('missing_recovered').
+ * In this case, missing_recovered is the number of top-level alloc class vdevs
+ * that are recoverable since the special_failsafe pool prop was on, and thus
+ * their data is "backed up" to the main pool.
+ *
+ * The separate 'missing_recovered' count is used during pool import to
+ * determine if we can import a pool with missing alloc class vdevs.
+ */
void
-spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+spa_set_missing_tvds(spa_t *spa, uint64_t missing, uint64_t missing_recovered)
{
spa->spa_missing_tvds = missing;
+ spa->spa_missing_recovered_tvds = missing_recovered;
}
/*
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index c74f72159dc9..a11b4d49597c 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -728,6 +728,60 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
return (vd);
}
+boolean_t
+vdev_is_leaf(vdev_t *vd)
+{
+ return (vd->vdev_children == 0);
+}
+
+/* Return true if vdev or TLD vdev is special alloc class */
+boolean_t
+vdev_is_special(vdev_t *vd)
+{
+ if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
+ return (B_TRUE);
+
+ /*
+ * If the vdev is a leaf vdev, and is part of a mirror, its parent
+ * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_SPECIAL, but the
+ * leaf vdev itself will not. So we also need to check the parent
+ * in those cases.
+ */
+ if (vdev_is_leaf(vd) &&
+ (vd->vdev_parent != NULL && vdev_is_special(vd->vdev_parent))) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/* Return true if vdev or TLD vdev is dedup alloc class */
+boolean_t
+vdev_is_dedup(vdev_t *vd)
+{
+ if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
+ return (B_TRUE);
+
+ /*
+	 * If the vdev is a leaf vdev, and is part of a mirror, its parent
+ * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_DEDUP, but the
+ * leaf vdev itself will not. So we also need to check the parent
+ * in those cases.
+ */
+ if (vdev_is_leaf(vd) &&
+ (vd->vdev_parent != NULL && vdev_is_dedup(vd->vdev_parent))) {
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+vdev_is_alloc_class(vdev_t *vd)
+{
+ return (vdev_is_special(vd) || vdev_is_dedup(vd));
+}
+
/*
* Allocate a new vdev. The 'alloctype' is used to control whether we are
* creating a new vdev or loading an existing one - the behavior is slightly
@@ -746,6 +800,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
int rc;
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
boolean_t top_level = (parent && !parent->vdev_parent);
+ const char *bias = NULL;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@@ -797,8 +852,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
if (top_level && alloctype == VDEV_ALLOC_ADD) {
- const char *bias;
-
/*
* If creating a top-level vdev, check for allocation
* classes input.
@@ -840,6 +893,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vd->vdev_tsd = tsd;
vd->vdev_islog = islog;
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+ }
+
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
@@ -3690,8 +3748,9 @@ vdev_load(vdev_t *vd)
VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
bias_str);
if (error == 0) {
- ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
- vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ if (vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias =
+ vdev_derive_alloc_bias(bias_str);
} else if (error != ENOENT) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -4150,7 +4209,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
* If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead.
*/
- if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+ if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
+ vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -4366,8 +4426,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
* don't allow it to be offlined. Log devices are always
* expendable.
*/
- if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
- vdev_dtl_required(vd))
+ if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
+ vd->vdev_aux == NULL && vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL,
SET_ERROR(EBUSY)));
@@ -4423,7 +4483,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
vd->vdev_offline = B_TRUE;
vdev_reopen(tvd);
- if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
+ vd->vdev_aux == NULL &&
vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE;
vdev_reopen(tvd);
@@ -5269,10 +5330,14 @@ vdev_propagate_state(vdev_t *vd)
* device, treat the root vdev as if it were
* degraded.
*/
- if (child->vdev_islog && vd == rvd)
+ if ((child->vdev_islog ||
+ vdev_is_special_failsafe(child)) &&
+ (vd == rvd)) {
degraded++;
- else
+ } else {
faulted++;
+ }
+
} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
degraded++;
}
@@ -5448,8 +5513,9 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
zfs_post_state_change(spa, vd, save_state);
}
- if (!isopen && vd->vdev_parent)
+ if (!isopen && vd->vdev_parent) {
vdev_propagate_state(vd->vdev_parent);
+ }
}
boolean_t
@@ -5517,6 +5583,24 @@ vdev_log_state_valid(vdev_t *vd)
return (B_FALSE);
}
+/*
+ * Is the vdev an alloc class vdev that is part of a pool that has
+ * special_failsafe on, and thus has all its data backed up to the main pool?
+ *
+ * This function works for both top-level vdevs and leaf vdevs.
+ */
+boolean_t
+vdev_is_special_failsafe(vdev_t *vd)
+{
+ if (vdev_is_alloc_class(vd))
+ return (vd->vdev_spa->spa_special_failsafe);
+
+ if (vdev_is_leaf(vd) && vd->vdev_parent != NULL)
+ return (vdev_is_special_failsafe(vd->vdev_parent));
+
+ return (B_FALSE);
+}
+
/*
* Expand a vdev if possible.
*/
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index ed592514fded..5469409550ae 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -521,8 +521,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_removing);
}
- /* zpool command expects alloc class data */
- if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
const char *bias = NULL;
switch (vd->vdev_alloc_bias) {
@@ -539,6 +538,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
ASSERT3U(vd->vdev_alloc_bias, ==,
VDEV_BIAS_NONE);
}
+
fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
bias);
}
@@ -1804,9 +1804,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
spa_t *spa = svd[0]->vdev_spa;
zio_t *zio;
uint64_t good_writes = 0;
+ boolean_t failure_but_special_failsafe = B_FALSE;
+ int rc;
zio = zio_root(spa, NULL, NULL, flags);
-
for (int v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
@@ -1850,7 +1851,38 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
(void) zio_wait(zio);
- return (good_writes >= 1 ? 0 : EIO);
+ /*
+ * Special case:
+ *
+ * If we had zero good writes, but all the writes were to alloc class
+ * disks that were on a pool with special_failsafe on, then it's not
+ * fatal.
+ */
+ if (good_writes == 0) {
+ failure_but_special_failsafe = B_TRUE;
+ for (int v = 0; v < svdcount; v++) {
+ if (!vdev_is_special_failsafe(svd[v])) {
+ failure_but_special_failsafe = B_FALSE;
+ break;
+ }
+ }
+ }
+
+ if (good_writes >= 1) {
+ /* success */
+ rc = 0;
+ } else if (failure_but_special_failsafe) {
+ /*
+ * All the failures are on allocation class disks that were
+ * fully backed up to the pool, so this isn't fatal.
+ */
+ rc = 0;
+ } else {
+ /* failure */
+ rc = EIO;
+ }
+
+ return (rc);
}
/*
@@ -1966,7 +1998,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
zio_t *vio = zio_null(zio, spa, NULL,
- (vd->vdev_islog || vd->vdev_aux != NULL) ?
+ (vd->vdev_islog || vd->vdev_aux != NULL ||
+ vdev_is_special_failsafe(vd)) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
vdev_label_sync(vio, good_writes, vd, l, txg, flags);
@@ -2019,6 +2052,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
if (error != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0)
return (error);
+
flags |= ZIO_FLAG_TRYHARD;
}
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index e132643dc330..3833bdf89d8d 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -32,6 +32,7 @@
#include
#include
#include
+#include
/*
* Virtual device vector for the pool's root vdev.
@@ -46,6 +47,7 @@ vdev_root_core_tvds(vdev_t *vd)
vdev_t *cvd = vd->vdev_child[c];
if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+		    !vdev_is_special_failsafe(cvd) &&
cvd->vdev_ops != &vdev_indirect_ops) {
tvds++;
}
@@ -87,6 +89,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
spa_t *spa = vd->vdev_spa;
int lasterror = 0;
int numerrors = 0;
+ int numerrors_recovered = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@@ -97,18 +100,25 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
-
if (cvd->vdev_open_error && !cvd->vdev_islog &&
cvd->vdev_ops != &vdev_indirect_ops) {
lasterror = cvd->vdev_open_error;
numerrors++;
+ if (vdev_is_special_failsafe(cvd))
+ numerrors_recovered++;
}
}
- if (spa_load_state(spa) != SPA_LOAD_NONE)
- spa_set_missing_tvds(spa, numerrors);
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
+ spa_set_missing_tvds(spa, numerrors, numerrors_recovered);
+ }
- if (too_many_errors(vd, numerrors)) {
+	if (numerrors != 0 && (numerrors == numerrors_recovered)) {
+		vdev_dbgmsg(vd, "there were %d top-level errors, but they "
+		    "were all alloc class vdevs with special_failsafe. Keep "
+		    "trying to import.", numerrors);
+ } else if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d68d5ababe79..78033064f370 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3501,6 +3501,19 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+ /*
+	 * Dedup writes can go either to a dedicated dedup device or to a
+	 * dedicated special device. If we have special_failsafe on, we need
+ * to make an extra copy of the data to go on the pool. To do this
+ * we need to adjust the ZIO's copies here so the later stages in the
+ * ZIO pipeline work correctly.
+ */
+ if (spa->spa_special_failsafe && zp->zp_copies == 1) {
+ zp->zp_copies = 2;
+ }
+
+ p = zp->zp_copies;
+
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
ddp = &dde->dde_phys[p];
@@ -3631,6 +3644,22 @@ zio_dva_throttle(zio_t *zio)
mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
+ /*
+ * If the special_failsafe pool prop is enabled, we will do the regular
+ * write to the special/dedup device and an additional "backup"
+ * write to the normal pool. That way if the special/dedup devices
+ * all fail, we don't lose all data in our pool.
+ *
+ * Reserve that 2nd write to the regular pool here. The DVAs
+ * for both writes will later be allocated in the
+ * next step in the ZIO pipeline in
+ * zio_dva_allocate()->metaslab_alloc().
+ */
+ if ((spa->spa_special_failsafe && (mc == spa_special_class(spa) ||
+ mc == spa_dedup_class(spa))) && zio->io_prop.zp_copies == 1) {
+ zio->io_prop.zp_copies = 2;
+ }
+
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index ac2c541a9188..3e5566aa7e65 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -53,6 +53,14 @@ tags = ['functional', 'arc']
tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
tags = ['functional', 'atime']
+
+[tests/functional/special_failsafe]
+tests = ['special_failsafe_add', 'special_failsafe_create',
+ 'special_failsafe_files', 'special_failsafe_import',
+ 'special_failsafe_offline', 'special_failsafe_prop',
+ 'special_failsafe_scrub', 'special_failsafe_split']
+tags = ['functional', 'special_failsafe']
+
[tests/functional/bclone]
tests = ['bclone_crossfs_corner_cases_limited',
'bclone_crossfs_data',
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index dfab48d2cdaf..7ccdd9bf12bf 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -1081,6 +1081,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
}
+# Get the specified vdev property in parsable format or fail
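+#
+# Example (hypothetical device): get_vdev_prop ashift $TESTPOOL /dev/sda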
+function get_vdev_prop # property pool vdev
+{
+ typeset prop=$1
+ typeset pool=$2
+ typeset vdev=$3
+
+	zpool get -Hpo value "$prop" "$pool" "$vdev" || \
+	    log_fail "zpool get $prop $pool $vdev"
+}
+
# Return 0 if a pool exists; $? otherwise
#
# $1 - pool name
@@ -1815,7 +1825,8 @@ function verify_pool
function get_disklist # pool
{
echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \
- grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$")
+ grep -vEe '^-----' | \
+ grep -Ev '^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|-[0-9]$')
}
#
@@ -3907,3 +3918,28 @@ function pop_coredump_pattern
;;
esac
}
+
+# Get a list of all vdevs in the pool that are a certain type.
+#
+# The returned list is in a space-separated string, with the full path of each
+# vdev included:
+#
+# "/dev/sda /dev/sdb /dev/sdc"
+#
+# $1: Type of disk to get ('special', 'dedup', 'log', 'cache', 'spare')
+# $2: (optional) pool name
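+#
+# Example (hypothetical pool): get_list_of_vdevs_that_are special tank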
+function get_list_of_vdevs_that_are # type [pool]
+{
+	typeset poolname=${2:-$TESTPOOL}
+
+ zpool status -P $poolname | sed -r '/\s+(mirror|draid|raidz)/d' | \
+ awk -v token="$1" '{
+ if (tmp == 1 && substr($1,1,1) == "/") {
+ if (first != 1) {
+ printf "%s", $1;
+ first=1;
+ } else {
+ printf " %s", $1;
+ }
+ } else {tmp=0}; if ($1 == token) {tmp=1}}
+ END {print ""}'
+}
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 44eedcf6fae5..42b0989907f4 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -90,6 +90,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
functional/alloc_class/alloc_class.kshlib \
functional/atime/atime.cfg \
functional/atime/atime_common.kshlib \
+ functional/special_failsafe/special_failsafe.cfg \
+ functional/special_failsafe/special_failsafe.kshlib \
functional/bclone/bclone.cfg \
functional/bclone/bclone_common.kshlib \
functional/bclone/bclone_corner_cases.kshlib \
@@ -441,6 +443,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/atime/root_atime_on.ksh \
functional/atime/root_relatime_on.ksh \
functional/atime/setup.ksh \
+ functional/special_failsafe/special_failsafe_add.ksh \
+ functional/special_failsafe/special_failsafe_create.ksh \
+ functional/special_failsafe/special_failsafe_files.ksh \
+ functional/special_failsafe/special_failsafe_import.ksh \
+ functional/special_failsafe/special_failsafe_prop.ksh \
+ functional/special_failsafe/special_failsafe_offline.ksh \
+ functional/special_failsafe/special_failsafe_scrub.ksh \
+ functional/special_failsafe/special_failsafe_split.ksh \
+ functional/special_failsafe/cleanup.ksh \
+ functional/special_failsafe/setup.ksh \
functional/bclone/bclone_crossfs_corner_cases.ksh \
functional/bclone/bclone_crossfs_corner_cases_limited.ksh \
functional/bclone/bclone_crossfs_data.ksh \
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh
index 3237d7cb784f..4ea64f8318e6 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh
@@ -32,12 +32,16 @@ log_assert $claim
log_onexit cleanup
log_must disk_setup
-for type in special dedup; do
- log_mustnot zpool create -d $TESTPOOL $CLASS_DISK0 $type $CLASS_DISK1
+
+for arg in '-o special_failsafe=on' '' ; do
+ for type in special dedup; do
+		log_mustnot zpool create $arg -d $TESTPOOL $CLASS_DISK0 \
+			$type $CLASS_DISK1
+ done
+	log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS \
+		special mirror $CLASS_DISK0 $CLASS_DISK1
+ log_must display_status "$TESTPOOL"
+ log_must zpool destroy -f "$TESTPOOL"
done
-log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
-log_must display_status "$TESTPOOL"
-log_must zpool destroy -f "$TESTPOOL"
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
index 78d40ce56d4e..7ab6552ebb0c 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
@@ -31,27 +31,29 @@ log_onexit cleanup
log_must disk_setup
-for type in "" "mirror" "raidz"
-do
- log_must zpool create $TESTPOOL $type $ZPOOL_DISKS
-
- if [ "$type" = "mirror" ]; then
- log_must zpool add $TESTPOOL special mirror \
- $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
- elif [ "$type" = "raidz" ]; then
- log_must zpool add $TESTPOOL special mirror \
- $CLASS_DISK0 $CLASS_DISK1
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
- else
- log_must zpool add $TESTPOOL special $CLASS_DISK0
- log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
- fi
-
- log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+ for type in "" "mirror" "raidz"
+ do
+ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS
+
+ if [ "$type" = "mirror" ]; then
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
+ elif [ "$type" = "raidz" ]; then
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+ else
+ log_must zpool add $TESTPOOL special $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ fi
+
+ log_must zpool destroy -f $TESTPOOL
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
index 04ce486adb83..131bf79ff306 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
@@ -36,31 +36,35 @@ typeset ac_value
typeset stype=""
typeset sdisks=""
-for type in "" "mirror" "raidz"
-do
- if [ "$type" = "mirror" ]; then
- stype="mirror"
- sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
- elif [ "$type" = "raidz" ]; then
- stype="mirror"
- sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
- else
- stype=""
- sdisks="${CLASS_DISK0}"
- fi
+for arg in '-o special_failsafe=on' '' ; do
+ for type in "" "mirror" "raidz"
+ do
+ if [ "$type" = "mirror" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
+ elif [ "$type" = "raidz" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+ else
+ stype=""
+ sdisks="${CLASS_DISK0}"
+ fi
- log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
- special $stype $sdisks
+ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \
+ special $stype $sdisks
- ac_value="$(zpool get -H -o property,value all | awk '/allocation_classes/ {print $2}')"
- if [ "$ac_value" = "active" ]; then
- log_note "feature@allocation_classes is active"
- else
- log_fail "feature@allocation_classes not active, \
- status = $ac_value"
- fi
+ ac_value="$(zpool get -H -o property,value \
+ feature@allocation_classes | \
+ awk '/allocation_classes/ {print $2}')"
+ if [ "$ac_value" = "active" ]; then
+ log_note "feature@allocation_classes is active"
+ else
+ log_fail "feature@allocation_classes not active, \
+ status = $ac_value"
+ fi
- log_must zpool destroy -f $TESTPOOL
+ log_must zpool destroy -f $TESTPOOL
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh
index 08c703e21acb..6e74b0a6b465 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh
@@ -34,38 +34,44 @@ log_must disk_setup
typeset ac_value
-for type in "" "mirror" "raidz"
-do
- if [ "$type" = "mirror" ]; then
- log_must zpool create $TESTPOOL $type $ZPOOL_DISK0 $ZPOOL_DISK1
- else
- log_must zpool create $TESTPOOL $type $ZPOOL_DISKS
- fi
- ac_value="$(zpool get -H -o property,value all | \
- awk '/allocation_classes/ {print $2}')"
- if [ "$ac_value" = "enabled" ]; then
- log_note "feature@allocation_classes is enabled"
- else
- log_fail "feature@allocation_classes not enabled, \
- status = $ac_value"
- fi
+for arg in '-o special_failsafe=on' '' ; do
+ for type in "" "mirror" "raidz"
+ do
+ if [ "$type" = "mirror" ]; then
+ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISK0 \
+ $ZPOOL_DISK1
+ else
+ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS
+ fi
+ ac_value="$(zpool get -H -o property,value \
+ feature@allocation_classes | \
+ awk '/allocation_classes/ {print $2}')"
+ if [ "$ac_value" = "enabled" ]; then
+ log_note "feature@allocation_classes is enabled"
+ else
+ log_fail "feature@allocation_classes not enabled, \
+ status = $ac_value"
+ fi
- if [ "$type" = "" ]; then
- log_must zpool add $TESTPOOL special $CLASS_DISK0
- else
- log_must zpool add $TESTPOOL special mirror \
- $CLASS_DISK0 $CLASS_DISK1
- fi
- ac_value="$(zpool get -H -o property,value all | \
- awk '/allocation_classes/ {print $2}')"
- if [ "$ac_value" = "active" ]; then
- log_note "feature@allocation_classes is active"
- else
- log_fail "feature@allocation_classes not active, \
- status = $ac_value"
- fi
+ if [ "$type" = "" ]; then
+ log_must zpool add $TESTPOOL special $CLASS_DISK0
+ else
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ fi
+ ac_value="$(zpool get -H -o property,value \
+ feature@allocation_classes | \
+ awk '/allocation_classes/ {print $2}')"
- log_must zpool destroy -f $TESTPOOL
+ if [ "$ac_value" = "active" ]; then
+ log_note "feature@allocation_classes is active"
+ else
+ log_fail "feature@allocation_classes not active, \
+ status = $ac_value"
+ fi
+
+ log_must zpool destroy -f $TESTPOOL
+ done
done
log_pass "Values of allocation_classes feature flag correct."
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
index 5852b2876e89..fc20fea6d096 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
@@ -32,10 +32,14 @@ log_onexit cleanup
log_must disk_setup
-log_must zpool create $TESTPOOL \
- mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
- special mirror $CLASS_DISK0 $CLASS_DISK1
-log_must zpool split $TESTPOOL split_pool
-log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+ log_must zpool create $arg $TESTPOOL \
+ mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+ special mirror $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool split $TESTPOOL split_pool
+ log_must zpool import -d $(dirname $CLASS_DISK1) split_pool
+ log_must zpool destroy -f $TESTPOOL
+ log_must zpool destroy -f split_pool
+done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
index 106a6d933aac..a08732e6248f 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
@@ -31,11 +31,13 @@ log_onexit cleanup
log_must disk_setup
-log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \
- special mirror $CLASS_DISK0 $CLASS_DISK1
-log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
-log_must sleep 10
-log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
-log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+ log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS \
+ special mirror $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
+ log_must sleep 10
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
+ log_must zpool destroy -f $TESTPOOL
+done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
index f73fbbe38c9b..2ac1024e351d 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
@@ -35,22 +35,24 @@ typeset special_type=""
typeset create_disks=""
typeset added_disks=""
-for type in "" "raidz"
-do
- if [ "$type" = "raidz" ]; then
- special_type="mirror"
- create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
- added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
- else
- special_type=""
- create_disks="${CLASS_DISK0}"
- added_disks="${CLASS_DISK1}"
- fi
- log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
- special $special_type $create_disks
- log_must zpool add $TESTPOOL special $special_type $added_disks
- log_must zpool iostat $TESTPOOL $added_disks
- log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+ for type in "" "raidz"
+ do
+ if [ "$type" = "raidz" ]; then
+ special_type="mirror"
+ create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
+ added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
+ else
+ special_type=""
+ create_disks="${CLASS_DISK0}"
+ added_disks="${CLASS_DISK1}"
+ fi
+ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \
+ special $special_type $create_disks
+ log_must zpool add $TESTPOOL special $special_type $added_disks
+ log_must zpool iostat $TESTPOOL $added_disks
+ log_must zpool destroy -f $TESTPOOL
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
index e8061fdabcbd..db9fa468eab2 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
@@ -35,35 +35,39 @@ typeset stype=""
typeset sdisks=""
typeset props=""
-for type in "" "mirror" "raidz"
-do
- if [ "$type" = "mirror" ]; then
- stype="mirror"
- sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
- props="-o ashift=12"
- elif [ "$type" = "raidz" ]; then
- stype="mirror"
- sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
- else
- stype=""
- sdisks="${CLASS_DISK0}"
- fi
+for arg in '-o special_failsafe=on' '' ; do
+ for type in "" "mirror" "raidz"
+ do
+ if [ "$type" = "mirror" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
+ props="-o ashift=12"
+ elif [ "$type" = "raidz" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+ else
+ stype=""
+ sdisks="${CLASS_DISK0}"
+ fi
- #
- # 1/3 of the time add the special vdev after creating the pool
- #
- if [ $((RANDOM % 3)) -eq 0 ]; then
- log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS
- log_must zpool add ${props} $TESTPOOL special $stype $sdisks
- else
- log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS \
- special $stype $sdisks
- fi
+ #
+ # 1/3 of the time add the special vdev after creating the pool
+ #
+ if [ $((RANDOM % 3)) -eq 0 ]; then
+ log_must zpool create $arg ${props} $TESTPOOL $type \
+ $ZPOOL_DISKS
+ log_must zpool add ${props} $TESTPOOL special $stype \
+ $sdisks
+ else
+ log_must zpool create $arg ${props} $TESTPOOL $type \
+ $ZPOOL_DISKS special $stype $sdisks
+ fi
- log_must zpool export $TESTPOOL
- log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
- log_must display_status $TESTPOOL
- log_must zpool destroy -f $TESTPOOL
+ log_must zpool export $TESTPOOL
+ log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
+ log_must display_status $TESTPOOL
+ log_must zpool destroy -f $TESTPOOL
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
index cbf5cbf89bdc..913f03f72fcb 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
@@ -32,19 +32,22 @@ log_onexit cleanup
log_must disk_setup
-log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
-
-for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072
-do
- log_must zfs set special_small_blocks=$value $TESTPOOL
- ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \
- awk '/special_small_blocks/ {print $3}')
- if [ "$ACTUAL" != "$value" ]
- then
- log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value!"
- fi
+for arg in '-o special_failsafe=on' '' ; do
+ log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+
+ for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072
+ do
+ log_must zfs set special_small_blocks=$value $TESTPOOL
+ ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \
+ awk '/special_small_blocks/ {print $3}')
+ if [ "$ACTUAL" != "$value" ]
+ then
+ log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value"
+ fi
+ done
+
+ log_must zpool destroy -f "$TESTPOOL"
done
-log_must zpool destroy -f "$TESTPOOL"
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
index 0be49b858758..ffc8b84468dc 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
@@ -32,13 +32,17 @@ log_assert $claim
log_onexit cleanup
log_must disk_setup
-log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
-for value in 256 1025 33554432
-do
- log_mustnot zfs set special_small_blocks=$value $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+ log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+
+ for value in 256 1025 33554432
+ do
+ log_mustnot zfs set special_small_blocks=$value $TESTPOOL
+ done
+
+ log_must zpool destroy -f "$TESTPOOL"
done
-log_must zpool destroy -f "$TESTPOOL"
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
index 0b1c18bafdaf..16d25a3f282a 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
@@ -25,20 +25,20 @@
verify_runnable "global"
#
-# Verify the file identified by the input is written on a special vdev
-# According to the pool layout used in this test vdev_id 3 and 4 are special
-# XXX: move this function to libtest.shlib once we get "Vdev Properties"
+# Given a dataset and an inode number, return a list of all the vdev numbers
+# that the inode has blocks on.
#
-function file_in_special_vdev #
+# For example, if the inode has blocks on vdevs 0, 1 and 2, this would return
+# the string "0 1 2"
+#
+function vdevs_file_is_on #
{
typeset dataset="$1"
typeset inum="$2"
- typeset num_normal=$(echo $ZPOOL_DISKS | wc -w)
- num_normal=${num_normal##* }
-
- zdb -dddddd $dataset $inum | awk -v d=$num_normal '{
+ zdb -dddddd $dataset $inum | awk '
+/L0 [0-9]+/{
# find DVAs from string "offset level dva" only for L0 (data) blocks
-if (match($0,"L0 [0-9]+")) {
dvas[0]=$3
dvas[1]=$4
dvas[2]=$5
@@ -50,25 +50,46 @@ if (match($0,"L0 [0-9]+")) {
print "Error parsing DVA: <" dva ">";
exit 1;
}
- # verify vdev is "special"
- if (arr[1] < d) {
- exit 1;
- }
+ count[arr[1]]++;
}
}
-}}'
+}
+END {
+ # Print out the unique vdev numbers that had data
+ firstprint=1;
+ for (i in count) {
+ if (firstprint==1) {
+ printf("%d", i);
+ firstprint=0;
+ } else {
+ printf(" %d", i);
+ }
+ }
+}
+'
}
#
# Check that device removal works for special class vdevs
#
+# $1: Set to 1 to backup alloc class data to the pool. Leave blank to disable
+# backup.
function check_removal
{
+ typeset backup
+ if [ "$1" == "1" ] ; then
+ backup=1
+ args="-o special_failsafe=on"
+ else
+ backup=0
+ args=""
+ fi
+
#
# Create a non-raidz pool so we can remove top-level vdevs
#
- log_must disk_setup
- log_must zpool create $TESTPOOL $ZPOOL_DISKS \
+ log_must zpool create $args $TESTPOOL $ZPOOL_DISKS \
special $CLASS_DISK0 special $CLASS_DISK1
log_must display_status "$TESTPOOL"
@@ -93,19 +114,49 @@ function check_removal
for i in 1 2 3 4; do
dataset="$TESTPOOL/$TESTFS"
inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)"
- log_must file_in_special_vdev $dataset $inum
+
+ # Get a list of all the vdevs 'testfile.$i' has blocks on.
+ # The list will be string like "0 1 2 3" if the blocks are on
+ # vdevs 0-3.
+ on_vdevs="$(vdevs_file_is_on $dataset $inum)"
+
+ # Get the number of normal (non-special) pool disks
+ num_pool_disks=$(echo $ZPOOL_DISKS | wc -w)
+ num_pool_disks=${num_pool_disks##* }
+
+ if [ "$backup" == "1" ] ; then
+ # Data should be on all vdevs (both pool and special
+ # devices).
+ lowest_data_disk=0
+ highest_data_disk=$(($num_pool_disks + 1))
+ else
+ # Data should only be on special devices
+ lowest_data_disk=$num_pool_disks
+ highest_data_disk=$(($lowest_data_disk + 1))
+ fi
+
+ # Get the starting disks that we expect the data to be on.
+ # We assume two special devices are attached to the pool.
+ # Disk numbers start at zero.
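+ # For example, with three pool disks and backup
+ # enabled, this yields "0 1 2 3 4".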
+ expected_on_vdevs="$(seq -s ' ' $lowest_data_disk $highest_data_disk)"
+
+ # Compare the disks we expect to see the blocks on with
+ # the actual disks they're on.
+ if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then
+ # Data distribution is not what we expected, break out of
+ # the loop so we can properly tear down the pool. We will
+ # error out after the loop.
+ break;
+ fi
done
log_must zpool remove $TESTPOOL $CLASS_DISK0
-
- sleep 5
- sync_pool $TESTPOOL
- sleep 1
-
- log_must zdb -bbcc $TESTPOOL
- log_must zpool list -v $TESTPOOL
log_must zpool destroy -f "$TESTPOOL"
- log_must disk_cleanup
+
+ if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then
+ log_fail "Expected data on disks $expected_on_vdevs, got $on_vdevs"
+ fi
}
claim="Removing a special device from a pool succeeds."
@@ -113,12 +164,15 @@ claim="Removing a special device from a pool succeeds."
log_assert $claim
log_onexit cleanup
-typeset CLASS_DEVSIZE=$CLASS_DEVSIZE
-for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do
- typeset ZPOOL_DISKS=$ZPOOL_DISKS
- for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do
- check_removal
+log_must disk_setup
+for backup in "1" "" ; do
+ typeset CLASS_DEVSIZE=$CLASS_DEVSIZE
+ for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do
+ typeset ZPOOL_DISKS=$ZPOOL_DISKS
+ for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do
+ check_removal $backup
+ done
done
done
-
+log_must disk_cleanup
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh
index 624cab88af0c..789bf816eabb 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh
@@ -33,31 +33,34 @@ log_onexit cleanup
# Create a non-raidz pool so we can remove top-level vdevs
#
log_must disk_setup
-log_must zpool create $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0
-log_must display_status "$TESTPOOL"
-#
-# Generate some dedup data in the dedup class before removal
-#
+for arg in '-o special_failsafe=on' '' ; do
+ log_must zpool create $arg $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0
+ log_must display_status "$TESTPOOL"
-log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL
-block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL"
-log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null"
+ #
+ # Generate some dedup data in the dedup class before removal
+ #
-sync_pool
-log_must zpool list -v $TESTPOOL
+ log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL
+ block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL"
+ log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null"
-#
-# remove a dedup allocation vdev
-#
-log_must zpool remove $TESTPOOL $CLASS_DISK0
+ sync_pool
+ log_must zpool list -v $TESTPOOL
+
+ #
+ # remove a dedup allocation vdev
+ #
+ log_must zpool remove $TESTPOOL $CLASS_DISK0
-sleep 5
-sync_pool $TESTPOOL
-sleep 1
+ sleep 5
+ sync_pool $TESTPOOL
+ sleep 1
-log_must zdb -bbcc $TESTPOOL
+ log_must zdb -bbcc $TESTPOOL
-log_must zpool destroy -f "$TESTPOOL"
+ log_must zpool destroy -f "$TESTPOOL"
+done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
index 1b52014fd2d9..aae7ecbe9568 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh
@@ -26,13 +26,15 @@ log_assert $claim
log_onexit cleanup
log_must disk_setup
-for size in 512 4096 32768 131072 524288 1048576
-do
- let bigger=$size*2
- log_mustnot zpool create -O recordsize=$size \
- -O special_small_blocks=$bigger \
- $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
+for arg in '-o special_failsafe=on' '' ; do
+ for size in 512 4096 32768 131072 524288 1048576
+ do
+ let bigger=$size*2
+ log_mustnot zpool create $arg -O recordsize=$size \
+ -O special_small_blocks=$bigger \
+ $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
index 49c468af6702..3922f8cb7bf9 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh
@@ -26,20 +26,22 @@ log_assert $claim
log_onexit cleanup
log_must disk_setup
-for size in 8192 32768 131072 524288 1048576
-do
- let smaller=$size/2
- log_must zpool create -O recordsize=$size \
- -O special_small_blocks=$smaller \
- $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
- log_must zpool destroy -f "$TESTPOOL"
-
- log_must zpool create -O recordsize=$size \
- -O special_small_blocks=$size \
- $TESTPOOL raidz $ZPOOL_DISKS special mirror \
- $CLASS_DISK0 $CLASS_DISK1
- log_must zpool destroy -f "$TESTPOOL"
+for arg in '-o special_failsafe=on' '' ; do
+ for size in 8192 32768 131072 524288 1048576
+ do
+ let smaller=$size/2
+ log_must zpool create $arg -O recordsize=$size \
+ -O special_small_blocks=$smaller \
+ $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool destroy -f "$TESTPOOL"
+
+ log_must zpool create $arg -O recordsize=$size \
+ -O special_small_blocks=$size \
+ $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool destroy -f "$TESTPOOL"
+ done
done
log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 6ebce9459190..62388d7dbc72 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -61,6 +61,7 @@ typeset -a properties=(
"bcloneused"
"bclonesaved"
"bcloneratio"
+ "special_failsafe"
"feature@async_destroy"
"feature@empty_bpobj"
"feature@lz4_compress"
@@ -87,6 +88,7 @@ typeset -a properties=(
"feature@device_rebuild"
"feature@draid"
"feature@redaction_list_spill"
+ "feature@special_failsafe"
)
if is_linux || is_freebsd; then
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh
new file mode 100755
index 000000000000..5681caecfc52
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh
@@ -0,0 +1,27 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018, Delphix
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+verify_runnable "global"
+
+default_cleanup_noexit
+disk_cleanup
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh
new file mode 100755
index 000000000000..5c2e45c8dc2e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh
@@ -0,0 +1,24 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+verify_runnable "global"
+
+disk_cleanup
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg
new file mode 100644
index 000000000000..84200593eb38
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg
@@ -0,0 +1,36 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+export ZPOOL_DISK0="$TEST_BASE_DIR/device-0"
+export ZPOOL_DISK1="$TEST_BASE_DIR/device-1"
+export ZPOOL_DISK2="$TEST_BASE_DIR/device-2"
+export ZPOOL_DISKS="${ZPOOL_DISK0} ${ZPOOL_DISK1} ${ZPOOL_DISK2}"
+
+export CLASS_DISK0="$TEST_BASE_DIR/device-3"
+export CLASS_DISK1="$TEST_BASE_DIR/device-4"
+export CLASS_DISK2="$TEST_BASE_DIR/device-5"
+export CLASS_DISK3="$TEST_BASE_DIR/device-6"
+export CLASS_DISK4="$TEST_BASE_DIR/device-7"
+export CLASS_DISK5="$TEST_BASE_DIR/device-8"
+
+export CLASS_DISKS="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2} ${CLASS_DISK3} ${CLASS_DISK4} ${CLASS_DISK5}"
+
+export ZPOOL_DEVSIZE=200M
+export CLASS_DEVSIZE=200M
+
+export IMPORTDIR="$TEST_BASE_DIR"
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib
new file mode 100644
index 000000000000..21aa6acd9aca
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib
@@ -0,0 +1,255 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.cfg
+
+BACKUP_DIR=$TEST_BASE_DIR/backups
+
+function disk_setup
+{
+ truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS
+ truncate -s $CLASS_DEVSIZE $CLASS_DISKS
+
+ if [ -d $BACKUP_DIR ] ; then
+ log_fail "Existing $TEST_BASE_DIR/backups directory (maybe leftover from failed test run?)"
+ fi
+
+ mkdir -p $BACKUP_DIR
+}
+
+function disk_cleanup
+{
+ rm -f $ZPOOL_DISKS 2> /dev/null
+ rm -f $CLASS_DISKS 2> /dev/null
+
+ rm -f special_failsafe.key
+ rm -fr $BACKUP_DIR
+}
+
+function cleanup
+{
+ if datasetexists $TESTPOOL ; then
+ zpool destroy -f $TESTPOOL 2> /dev/null
+ fi
+
+ disk_cleanup
+}
+
+# Write zeros to an existing file, keeping the same size.
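+# Used below to simulate corruption or loss of an alloc class vdev's contents.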
+function zero_file {
+ dd status=none if=/dev/zero of="$1" bs=$(stat_size "$1") count=1
+}
+
+# Write a verifiable file that will end up on a 'dedup' or 'special' vdev.
+# The filename will include the sha256 of the file for easy verification later.
+#
+# $1: Write type - "dedup" or "special"
+# $2: Path to directory to write the file to
+#
+# Note: we don't use log_must here since this can get really chatty and
+# we don't want to spam the logs. It will log_fail if there is an error.
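+#
+# Example (as used by write_some_files below):
+#
+#   write_verifiable_file special /$TESTPOOL/$TESTFS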
+function write_verifiable_file {
+ class="$1"
+ writedir="$2"
+
+ if [[ "$class" == "dedup" ]] ; then
+ # Our dedup file size can be up to a megabyte-ish
+ filesize=$((32768 + ($RANDOM * $RANDOM % 1000000)))
+
+ # Make write a multiple of the recordsize for dedup
+ bs=32768
+ count=$(($filesize / $bs))
+
+ # Fill data with the letter 'a' for dedup
+ file_write -b $bs -c $count -d 'a' -o create -f $writedir/tmp || return
+ else
+ # Make all files less than the 32k special_small_blocks size we
+ # set up at dataset creation time
+ filesize=$((($RANDOM % 32767) + 1))
+ bs=$filesize
+ count=1
+ dd status=none if=/dev/urandom bs=$bs count=$count of="$writedir/tmp" || return
+ fi
+
+ csum=$(sha256digest "$writedir/tmp")
+ newfile=$csum.$class$totalwritten
+ mv "$writedir/tmp" "$writedir/$newfile"
+
+ # Basic sanity check that we created our final file and it has a non-zero size
+ expectedsize=$(($bs * $count))
+ actualsize=$(stat_size "$writedir/$newfile")
+ if [[ "$actualsize" != "$expectedsize" ]] || [[ "$actualsize" == "0" ]] ; then
+ log_fail "File $writedir/$newfile bad size $actualsize (expected $expectedsize)"
+ return
+ fi
+
+ totalwritten=$(($totalwritten + 1))
+}
+
+# Write some files to all our datasets.
+#
+# For each dataset:
+#
+# - 10 files should hit special vdevs
+# - 10 files should hit dedup vdevs
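+#
+# Each call therefore adds 120 files (6 datasets x 2 classes x 10 files) and
+# advances $totalwritten by the same amount.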
+function write_some_files {
+ typeset i j
+ for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
+ for j in $(seq 1 10) ; do
+ write_verifiable_file special /$TESTPOOL/$i
+ write_verifiable_file dedup /$TESTPOOL/$i
+ done
+ done
+}
+
+# Given a directory containing only files created by write_verifiable_file(),
+# verify that the contents of the file match the sha256sum in the file's name.
+#
+# $1: Dir path with files to verify
+function verify_directory {
+ typeset verifydir="$1"
+ typeset i
+ for i in $(ls $verifydir) ; do
+
+ # Files will look like:
+ #
+ # ed324386045fa39d3f41d4f13c8c3e6a4698466e2b694c327f7e490be9e4e33f.dedup13
+ #
+ # Just grab the sha256 part
+
+ shaname="$(echo $i | cut -f1 -d'.')"
+ if [[ $(sha256digest "$verifydir/$i") != "$shaname" ]] ; then
+ log_fail "$verifydir/$i sha256 not $shaname"
+ false
+ return
+ fi
+ done
+ true
+}
+
+function backup_alloc_class_disks {
+ typeset i
+ for i in $@ ; do
+ cp ${i} $BACKUP_DIR/$(basename $i)
+ done
+}
+
+function restore_alloc_class_disks {
+ typeset i
+ for i in $@ ; do
+ mv $BACKUP_DIR/$(basename $i) ${i}
+ done
+}
+
+function zero_alloc_class_disks {
+ typeset i
+ for i in $@ ; do
+ zero_file "${i}"
+ done
+}
+
+# Create multiple datasets with different permutations of copies and encryption
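+# ($TESTFS, 2copies, 3copies, encrypted, encrypted2copies, encrypted3copies),
+# all with dedup=on and special_small_blocks=32K so the small files written by
+# write_verifiable_file land in the special and dedup allocation classes.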
+function special_failsafe_make_datasets {
+
+ log_must zfs create -o compression=off -o special_small_blocks=32K -o recordsize=32K \
+ -o dedup=on $TESTPOOL/$TESTFS
+
+ keyfile=$(pwd)/special_failsafe.key
+ dd if=/dev/urandom of=$keyfile bs=32 count=1
+
+ log_must zfs create -o copies=2 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+ $TESTPOOL/2copies
+
+ log_must zfs create -o copies=3 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+ $TESTPOOL/3copies
+
+ log_must zfs create -o encryption=on -o keylocation=file://$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+ $TESTPOOL/encrypted
+
+ log_must zfs create -o copies=2 -o encryption=on -o keylocation=file://$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+ $TESTPOOL/encrypted2copies
+
+ log_must zfs create -o copies=3 -o encryption=on -o keylocation=file://$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+ $TESTPOOL/encrypted3copies
+}
+
+# For each dataset we created in special_failsafe_make_datasets, go though
+# and check that all the files in the datasets have the correct data.
+function verify_all_directories {
+ typeset i
+ for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
+ verify_directory /$TESTPOOL/$i
+ done
+
+ # ...we should also have the correct number of files
+ totalfiles=0
+ for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
+ totalfiles=$(($totalfiles + $(ls /$TESTPOOL/$i | wc -w)))
+ done
+
+ if [[ "$totalfiles" != "$totalwritten" ]] ; then
+ log_fail "Wrong file count: expected $totalwritten, got $totalfiles"
+ else
+ log_note "Verified $totalfiles files"
+ fi
+}
+
+# Return a space separated string of disks that are alloc class vdevs. Disk
+# names will include the full path.
+function get_list_of_alloc_class_disks {
+ typeset special_disks=$(get_list_of_vdevs_that_are "special")
+ typeset dedup_disks=$(get_list_of_vdevs_that_are "dedup")
+ typeset disks="$dedup_disks"
+
+ if [ -n "$special_disks" ] ; then
+ disks="$special_disks $disks"
+ fi
+
+ echo "$disks"
+}
+
+# Check that the pool's special_failsafe feature and property are consistent:
+# if the feature is disabled, the property must be off.
+function check_pool_alloc_class_props {
+ typeset special_failsafe_feature=$(get_pool_prop feature@special_failsafe $TESTPOOL)
+ typeset special_failsafe_prop=$(get_pool_prop special_failsafe $TESTPOOL)
+ if [ "$special_failsafe_feature" == "disabled" ] ; then
+ log_must [ "$special_failsafe_prop" == "off" ]
+ fi
+}
+
+# Simple function to check that pool properties are what we expect. The
+# values we expect are passed to this function:
+#
+# $1: 'feature@special_failsafe' pool feature
+# $2: 'special_failsafe' pool prop
+#
+# This function will log_fail on error.
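+#
+# Example: boilerplate_check "active" "on"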
+function boilerplate_check {
+ typeset special_failsafe_feature=$1
+ typeset special_failsafe_prop=$2
+
+ if [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" != "$special_failsafe_feature" ] ; then
+ log_fail "feature@special_failsafe = $(get_pool_prop feature@special_failsafe $TESTPOOL), expected $special_failsafe_feature"
+ fi
+
+ if [ "$(get_pool_prop special_failsafe $TESTPOOL)" != "$special_failsafe_prop" ] ; then
+ log_fail "special_failsafe = $(get_pool_prop special_failsafe $TESTPOOL), expected $special_failsafe_prop"
+ fi
+}
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh
new file mode 100755
index 000000000000..36ff874cb00e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Verify that 'zpool add' and 'zpool attach' disks have the correct
+# special_failsafe settings.
+
+verify_runnable "global"
+
+claim="zpool add|attach disks have correct special_failsafe settings"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try different pool configurations
+configs="mirror $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1
+mirror $ZPOOL_DISK0 $ZPOOL_DISK1 dedup mirror $CLASS_DISK0 $CLASS_DISK1"
+
+log_must disk_setup
+
+function do_test {
+ typeset config="$1"
+ typeset initial=$2
+ typeset new=$3
+
+ log_must zpool create -o special_failsafe=$initial $TESTPOOL $config
+ totalwritten=0
+
+ # Sanity check that feature@special_failsafe aligns with the
+ # pool prop
+ if [ $initial == "on" ] ; then
+ feature_expected="active"
+ else
+ feature_expected="enabled"
+ fi
+ boilerplate_check "$feature_expected" "$initial"
+
+ special_failsafe_make_datasets
+ write_some_files
+
+ if [ $initial != "off" ] ; then
+ log_must zpool set special_failsafe=$new $TESTPOOL
+ fi
+
+ write_some_files
+
+ # Now add a new special/dedup disk to the special mirror
+ log_must zpool attach $TESTPOOL $CLASS_DISK0 $CLASS_DISK2
+ write_some_files
+
+ # Add another special & dedup disk in RAID0 with the existing
+ # special mirror
+ log_must zpool add $TESTPOOL special $CLASS_DISK3
+ log_must zpool add $TESTPOOL dedup $CLASS_DISK4
+
+ write_some_files
+ verify_all_directories
+
+ log_must zpool export $TESTPOOL
+
+ alloc_class_disks="$(get_list_of_alloc_class_disks)"
+ zero_alloc_class_disks $alloc_class_disks
+
+ log_must zpool import -l -d $IMPORTDIR $TESTPOOL
+
+ verify_all_directories
+
+ log_must zpool destroy $TESTPOOL
+}
+
+# Run each config through every permutation of initial and new
+# special_failsafe values, adding and attaching disks along the way.
+echo "$configs" | while read config ; do
+ for initial in "on" "off" ; do
+ for new in "on" "off" ; do
+ do_test "$config" $initial $new
+ done
+ done
+done
+
+cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh
new file mode 100755
index 000000000000..1905fba16073
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh
@@ -0,0 +1,86 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+# DESCRIPTION:
+# Verify 'zpool create' with different alloc class redundancy
+# levels will correctly succeed or fail.
+
+verify_runnable "global"
+
+claim="zpool create with different special_failsafe and disk permutations work"
+
+log_assert $claim
+log_onexit cleanup
+
+# These should always pass since they have the same redundancy level
+configs_pass="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1
+mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup mirror $CLASS_DISK0 $CLASS_DISK1
+mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3"
+
+# These should always pass with special_failsafe enabled or when '-f' is passed.
+# They should fail otherwise.
+configs_pass_failsafe="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0
+mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup $CLASS_DISK0
+mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 dedup $CLASS_DISK2
+mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2"
+
+log_must disk_setup
+
+# Try configs with matching redundancy levels. They should all pass.
+echo "$configs_pass" | while read config ; do
+ log_must zpool create -o feature@special_failsafe=disabled $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -o special_failsafe=on $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -f -o special_failsafe=on $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+done
+
+# Try configs with a lower alloc class redundancy level. They should fail if
+# special_failsafe is turned off and -f is not used.
+echo "$configs_pass_failsafe" | while read config ; do
+ log_mustnot zpool create -o feature@special_failsafe=disabled $TESTPOOL $config
+
+ log_must zpool create -o special_failsafe=on $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -f -o special_failsafe=on $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config
+
+ log_must zpool create -f -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config
+ log_must zpool destroy $TESTPOOL
+
+ log_mustnot zpool create -o feature@special_failsafe=enabled -o special_failsafe=off $TESTPOOL $config
+done
+
+cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh
new file mode 100755
index 000000000000..808df272a4c7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh
@@ -0,0 +1,124 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Test multiple different special_failsafe permutations. After each step
+# write a bunch of known files. Verify all files are present and correct
+# after all the steps are complete.
+
+verify_runnable "global"
+
+claim="Files on special_failsafe enabled disks do not get corrupted"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try different pool configurations
+configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+echo "$configs" | while read config ; do
+ log_must disk_setup
+ log_must zpool create -o special_failsafe=on $TESTPOOL $config
+ totalwritten=0
+ special_failsafe_make_datasets
+
+ write_some_files
+ verify_all_directories
+
+ alloc_class_disks="$(get_list_of_alloc_class_disks)"
+ log_must zpool export $TESTPOOL
+
+ backup_alloc_class_disks $alloc_class_disks
+ zero_alloc_class_disks $alloc_class_disks
+
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+ # Our pool is imported but has all its special devices zeroed out. Try
+ # writing some files to it, then export the pool.
+ write_some_files
+
+ log_must zpool export $TESTPOOL
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+ write_some_files
+
+ log_must zpool export $TESTPOOL
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+ write_some_files
+
+ # Make our old disks appear again (which have older data). Do a zpool
+ # clear to make them come back online and resilver.
+ restore_alloc_class_disks $alloc_class_disks
+ log_must zpool clear $TESTPOOL
+
+ write_some_files
+
+ # At this point the pool should be normal. The next test is to
+ # corrupt the alloc class devices while the pool is running.
+ zero_alloc_class_disks $alloc_class_disks
+
+ # Trigger a scrub with our newly-zeroed alloc class disks
+ log_must zpool scrub $TESTPOOL
+
+ # The pool should be degraded, but still alive.
+ check_state $TESTPOOL "" "DEGRADED"
+
+ write_some_files
+
+ # Replace all the alloc class disks. This should get the pool
+ # back to normal.
+ for disk in $alloc_class_disks ; do
+ log_must zpool replace $TESTPOOL $disk
+ done
+
+ write_some_files
+
+ log_must zpool export $TESTPOOL
+
+ # Back up the special disks, then totally remove them.
+ backup_alloc_class_disks $alloc_class_disks
+
+ rm -f $alloc_class_disks
+
+ # Try to import with the alloc class disks missing - it should work.
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+ # After all the pain we've put our pool through, it should still have all the
+ # correct file data.
+ log_must verify_all_directories
+
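+ # write_some_files ran 7 times above, each pass writing 120 files
+ # (6 datasets x 2 classes x 10), so we expect 7 * 120 = 840.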
+ if [[ "$totalwritten" != "840" ]] ; then
+ log_fail "Didn't see 840 files, saw $totalwritten"
+ fi
+
+ # We've checked all the files. Do some more verifications.
+ verify_pool $TESTPOOL
+ verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR
+
+ # Record a few stats that show the metadata classes are in use
+ zpool get dedup $TESTPOOL
+ zdb -bb $TESTPOOL 2>&1 | grep -Ei 'normal|special|dedup|ddt'
+
+ log_must zpool destroy $TESTPOOL
+ cleanup
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh
new file mode 100755
index 000000000000..d8ba52c702b3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh
@@ -0,0 +1,93 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Verify we can import a special_failsafe pool even if all its alloc class
+# devices are missing.
+#
+verify_runnable "global"
+
+claim="Verify imports work on special_failsafe pools when vdevs missing"
+
+log_assert $claim
+log_onexit cleanup
+
+TWO_ZPOOL_DISKS="$ZPOOL_DISK0 $ZPOOL_DISK1"
+
+# Try a bunch of different pool configurations
+configs="$TWO_ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $TWO_ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$TWO_ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$TWO_ZPOOL_DISKS special $CLASS_DISK0
+$TWO_ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+ typeset config="$1"
+ typeset action="$2"
+ typeset onoff="$3"
+
+ totalwritten=0
+ log_must disk_setup
+ log_must zpool create -o special_failsafe=$onoff $TESTPOOL $config
+
+ alloc_class_disks="$(get_list_of_alloc_class_disks)"
+
+ special_failsafe_make_datasets
+ write_some_files
+ verify_all_directories
+
+ log_must zpool export $TESTPOOL
+
+ # Back up the alloc class disks before removing them
+ backup_alloc_class_disks $alloc_class_disks
+ if [ "$action" == "remove" ] ; then
+ rm -f $alloc_class_disks
+ else
+ zero_alloc_class_disks $alloc_class_disks
+ fi
+
+ # The import should succeed or fail depending on special_failsafe
+ if [ "$onoff" == "on" ] ; then
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+ else
+ log_mustnot zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+ # With the disks restored, we should be able to import
+ restore_alloc_class_disks $alloc_class_disks
+ log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+ fi
+ write_some_files
+
+ # Do a scrub and verify everything is correct
+ verify_pool $TESTPOOL
+
+ verify_all_directories
+
+ zpool destroy $TESTPOOL
+
+ cleanup
+}
+
+echo "$configs" | while read config ; do
+ for action in "remove" "zero" ; do
+ for onoff in "off" "on" ; do
+ do_test "$config" "$action" "$onoff"
+ done
+ done
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh
new file mode 100755
index 000000000000..8f5722dfd8d0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh
@@ -0,0 +1,124 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Verify we can offline alloc class disks when special_failsafe is on.
+# Verify offlining is restricted as usual when special_failsafe is off.
+#
+verify_runnable "global"
+
+claim="Verify correct behavior when we force fault an alloc class disk"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try a bunch of different pool configurations
+configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+ prop="$1"
+ config="$2"
+ log_must disk_setup
+ log_must zpool create -f $prop $TESTPOOL $config
+ check_pool_alloc_class_props
+
+ special_failsafe_make_datasets
+ totalwritten=0
+ write_some_files
+
+ alloc_class_disks=$(get_list_of_alloc_class_disks)
+ alloc_class_disks_arr=($alloc_class_disks)
+
+ if [ "$prop" == "-o special_failsafe=on" ] ; then
+ log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "active" ]
+ else
+ log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "enabled" ]
+ fi
+
+ for ((i = 0; i < ${#alloc_class_disks_arr[@]}; i++)); do
+ disk="${alloc_class_disks_arr[$i]}"
+ if [ "$prop" == "-o special_failsafe=on" ] ; then
+ # Everything is backed-up. We should be able to
+ # offline all the disks.
+ log_must zpool offline $TESTPOOL $disk
+ log_must check_state $TESTPOOL "$disk" "OFFLINE"
+ log_must check_state $TESTPOOL "" "DEGRADED"
+ else
+ PARENT=$(get_vdev_prop parent $TESTPOOL $disk)
+ if [ "$PARENT" == "$TESTPOOL" ] ; then
+ # Leaf is TLD, offline should fail
+ log_mustnot zpool offline $TESTPOOL $disk
+ log_must check_state $TESTPOOL "$disk" "ONLINE"
+ log_must check_state $TESTPOOL "" "ONLINE"
+ else
+ # We're part of a mirror. We know all
+ # mirrors in our test pool are two-disk
+ # mirrors, so we should be able to offline
+ # the first disk, but not the second.
+ if [ "$i" == "0" ] ; then
+ # First alloc class disk - pretend
+ # "previous" disk was online to
+ # make things easy.
+ prev_online=1
+ else
+ if check_state $TESTPOOL "${alloc_class_disks_arr[$i - 1]}" "ONLINE" ; then
+ prev_online=1
+ else
+ prev_online=0
+ fi
+ fi
+
+ if [ "$prev_online" == "1" ] ; then
+ # First disk in mirror, can offline
+ log_must zpool offline $TESTPOOL $disk
+ log_must check_state $TESTPOOL "$disk" "OFFLINE"
+ log_must check_state $TESTPOOL "" "DEGRADED"
+ else
+ # Second disk in mirror, can't offline
+ # but we should still be in a pool
+ # degraded state from the first disk
+ # going offline.
+ log_mustnot zpool offline $TESTPOOL $disk
+ log_must check_state $TESTPOOL "$disk" "ONLINE"
+ log_must check_state $TESTPOOL "" "DEGRADED"
+ fi
+ fi
+ fi
+ done
+
+ write_some_files
+ verify_all_directories
+
+ # We've checked all the files. Do some more verifications.
+ verify_pool $TESTPOOL
+ verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR
+
+ zpool clear $TESTPOOL
+ zpool destroy $TESTPOOL
+ cleanup
+}
+
+for prop in "-o special_failsafe=on" "" ; do
+ echo "$configs" | while read config ; do
+ do_test "$prop" "$config"
+ done
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh
new file mode 100755
index 000000000000..2c5c60251545
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh
@@ -0,0 +1,118 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Verify that special_failsafe prop does not work if
+# SPA_FEATURE_SPECIAL_FAILSAFE is disabled. Also, test upgrades.
+
+verify_runnable "global"
+
+claim="special_failsafe prop shouldn't work without SPA_FEATURE_SPECIAL_FAILSAFE"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try a bunch of different pool configurations
+configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+# Keep the pool disks small so they are quick to back up. We don't put
+# much data on them.
+export ZPOOL_DEVSIZE=200M
+export CLASS_DEVSIZE=200M
+
+log_must disk_setup
+
+echo "$configs" | while read config ; do
+ # We should not be able to set special_failsafe=on if the feature
+ # flag is disabled.
+ log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=on $TESTPOOL $config
+
+ # Try a few permutations that should succeed
+ log_must zpool create -o special_failsafe=off $TESTPOOL $config
+ boilerplate_check "enabled" "off"
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -o special_failsafe=on $TESTPOOL $config
+ boilerplate_check "active" "on"
+ log_must zpool destroy $TESTPOOL
+
+ log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config
+ boilerplate_check "active" "on"
+ log_must zpool destroy $TESTPOOL
+done
+
+# Now let's do a multi-step test where we upgrade an older pool
+for cmd in "zpool set feature@special_failsafe=enabled $TESTPOOL" "zpool upgrade $TESTPOOL" ; do
+
+ # Make a pool with no special devices
+ log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL mirror $ZPOOL_DISKS
+ totalwritten=0
+
+ boilerplate_check "disabled" "off"
+ special_failsafe_make_datasets
+ write_some_files
+
+ # Test enabling the feature in two different ways:
+ #
+ # zpool set feature@special_failsafe=enabled ...
+ # zpool upgrade ...
+ #
+ log_must eval "$cmd"
+ boilerplate_check "enabled" "off"
+ write_some_files
+
+ # Shouldn't be able to add with special_failsafe prop off
+ log_mustnot zpool add $TESTPOOL special $CLASS_DISK0
+
+ log_must zpool set special_failsafe=on $TESTPOOL
+ boilerplate_check "enabled" "on"
+ write_some_files
+
+ log_must zpool add $TESTPOOL special $CLASS_DISK0
+
+ boilerplate_check "active" "on"
+
+ write_some_files
+
+ log_must zpool add $TESTPOOL dedup $CLASS_DISK1
+
+ write_some_files
+
+ log_must zpool export $TESTPOOL
+ log_must zpool import -l -d $IMPORTDIR $TESTPOOL
+
+ verify_all_directories
+
+ # You should be able to turn special_failsafe off if it was on
+ log_must zpool set special_failsafe=off $TESTPOOL
+
+ boilerplate_check "active" "off"
+
+ # If special_failsafe prop was on and the feature active, and then you
+ # turned the prop off, you cannot turn it back on again.
+ log_mustnot zpool set special_failsafe=on $TESTPOOL
+
+ log_must zpool destroy $TESTPOOL
+done
+
+cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
new file mode 100755
index 000000000000..7ccb32b7bf82
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Destroy alloc class disks and then do a scrub on both a
+# special_failsafe and non-special_failsafe pool. The special_failsafe
+# pool should only be DEGRADED, while the non-special_failsafe pool should
+# be SUSPENDED.
+
+verify_runnable "global"
+
+claim="special_failsafe pools survive a normally fatal scrub with bad disks"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try different pool configurations
+configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+ typeset config="$1"
+ typeset action="$2"
+ typeset onoff="$3"
+ totalwritten=0
+
+ log_must disk_setup
+ log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=$onoff $TESTPOOL $config
+
+ special_failsafe_make_datasets
+
+ totalwritten=0
+ write_some_files
+
+ # When we do a scrub later, we will either want it to suspend or not
+ # suspend the pool, depending on our backup settings. Make sure we are
+ # able to ride through the suspended pool so we can continue with our
+ # tests.
+ log_must zpool set failmode=continue $TESTPOOL
+
+ alloc_class_disks="$(get_list_of_alloc_class_disks)"
+ backup_alloc_class_disks $alloc_class_disks
+ zero_alloc_class_disks $alloc_class_disks
+
+ # Spawn scrub into the background since the pool may be suspended and
+ # it will hang. We need to continue past the hung scrub so we
+ # can restore the bad disks and do a 'zpool clear' to clear the
+ # suspension.
+ zpool scrub $TESTPOOL &
+
+ wait_scrubbed $TESTPOOL 3
+ if [ "$onoff" == "on" ] ; then
+ log_must check_state $TESTPOOL "" "DEGRADED"
+
+ verify_pool $TESTPOOL
+
+ write_some_files
+ verify_all_directories
+ else
+ log_must check_state $TESTPOOL "" "SUSPENDED"
+
+ # Pool should be suspended. Restore the old disks so we can
+ # clear the suspension with 'zpool clear'; cleanup then
+ # destroys the pool.
+ restore_alloc_class_disks $alloc_class_disks
+ log_must zpool clear $TESTPOOL
+ fi
+
+ cleanup
+}
+
+# Stop zed in case we left it running from an old, aborted test run.
+zed_stop
+zed_cleanup
+
+log_must zed_setup
+log_must zed_start
+log_must zed_events_drain
+
+# Verify scrubs work as expected with different permutations of special_failsafe
+echo "$configs" | while read config ; do
+ for i in "on" "off" ; do
+ do_test "$config" "zero" "$i"
+ done
+done
+
+log_must zed_stop
+log_must zed_cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh
new file mode 100755
index 000000000000..79a3008740fc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh
@@ -0,0 +1,94 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# Verify we can split a pool with special_failsafe, and that the new pool
+# keeps the special_failsafe settings. Also verify the new pool has all the
+# data if special_failsafe was on.
+#
+verify_runnable "global"
+
+claim="zpool split works with special_failsafe"
+
+log_assert $claim
+log_onexit cleanup
+
+# Create a mirrored pool with special_failsafe on
+log_must disk_setup
+log_must zpool create -o special_failsafe=on $TESTPOOL mirror \
+ $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
+ mirror $CLASS_DISK2 $CLASS_DISK3
+
+totalwritten=0
+special_failsafe_make_datasets
+write_some_files
+verify_all_directories
+
+# Split the pool and verify the old pool has all the data
+newpool="${TESTPOOL}-2"
+
+log_must zpool split $TESTPOOL $newpool
+check_pool_alloc_class_props
+verify_all_directories
+
+# Force-fault alloc class devices on the old pool and verify we have all the
+# data.
+log_must zpool offline -f $TESTPOOL $CLASS_DISK0
+log_must zpool offline -f $TESTPOOL $CLASS_DISK2
+log_must check_state $TESTPOOL $CLASS_DISK0 "FAULTED"
+log_must check_state $TESTPOOL $CLASS_DISK2 "FAULTED"
+
+log_must check_state $TESTPOOL "" "DEGRADED"
+verify_all_directories
+
+log_must zpool clear $TESTPOOL
+
+# All done with the old pool
+log_must zpool destroy $TESTPOOL
+
+# Import the new split pool and rename it $TESTPOOL since all our verification
+# functions expect the pool to be called $TESTPOOL.
+log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL
+
+check_pool_alloc_class_props
+verify_all_directories
+
+# Zero the alloc class devices on the split pool and verify we have all the
+# data.
+log_must zpool export $TESTPOOL
+
+zero_file $CLASS_DISK1
+zero_file $CLASS_DISK3
+
+log_must zpool import -l -f -d $IMPORTDIR $TESTPOOL
+
+verify_all_directories
+log_must zpool destroy $TESTPOOL
+
+# Create a non-special_failsafe pool, split it, and verify the split pool is
+# also not special_failsafe.
+log_must zpool create -o special_failsafe=off $TESTPOOL mirror \
+ $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
+ mirror $CLASS_DISK2 $CLASS_DISK3
+
+log_must zpool split $TESTPOOL $newpool
+check_pool_alloc_class_props
+log_must zpool destroy $TESTPOOL
+log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL
+check_pool_alloc_class_props
+log_must zpool destroy $TESTPOOL
+
+log_pass $claim