diff --git a/include/sys/spa.h b/include/sys/spa.h index 2e365eabe21d..4285aaa434f9 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1144,11 +1144,14 @@ extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); +extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx); +extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, + uint64_t old_head_ds, dmu_tx_t *tx); /* vdev cache */ extern void vdev_cache_stat_init(void); diff --git a/include/sys/zio.h b/include/sys/zio.h index 121b58dea58e..e09fa211dc51 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -291,6 +291,13 @@ struct zbookmark_phys { uint64_t zb_blkid; }; +typedef struct zbookmark_err_phys { + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; + uint64_t zb_birth; +} zbookmark_err_phys_t; + #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 874cbd9ff714..0d6634f196f8 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -75,6 +75,7 @@ typedef enum spa_feature { SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURE_ZSTD_COMPRESS, SPA_FEATURE_DRAID, + SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 8a696206a56f..83eb751f3348 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -596,7 +596,7 @@ - + @@ -1856,8 +1856,8 @@ - - + + @@ -1896,7 +1896,8 @@ - + + @@ -1944,7 +1945,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index d10ca7f441f6..bf9bd259fcd0 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -832,7 +832,15 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. +.feature com.delphix head_errlog no +This feature enables the upgraded version of errlog. +This required an on-disk error format change. +Now the errorlog of each head dataset is stored separately +in the zap object and keyed by the head id. +With this feature enabled, every dataset affected by an error block is listed +in the output of +.Nm zpool Cm status . +This feature can only be enabled at pool creation time. .El -. .Sh SEE ALSO .Xr zpool 8 diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index da7454c118d7..ddfee47e23d7 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -594,6 +594,11 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DRAID, "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_HEAD_ERRLOG, + "com.delphix:head_errlog", "head_errlog", + "Introduce an on-disk block error log.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index f99964511aa6..925985d5208a 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3741,6 +3741,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index a2748197f29d..fbcdeba428c5 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1153,6 +1153,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index fa5120eb61b3..7f048ef18411 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -53,7 +53,9 @@ #include #include #include - +#include +#include +#include /* * Convert a bookmark to a string. @@ -67,9 +69,20 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert an err_phys to a string. */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + #ifdef _KERNEL +/* + * Convert a string to a bookmark. + */ static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,8 +95,92 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } + +/* + * Convert a string to a err_phys. + */ +static void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} #endif +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static int +get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, + uint64_t *head_dataset_id) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + dsl_pool_config_enter(dp, FTAG); + int err = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + return (SET_ERROR(EFAULT)); + } + ASSERT3P(head_dataset_id, !=, NULL); + *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (SET_ERROR(EFAULT)); + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep->zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (SET_ERROR(EFAULT)); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (err != 0 || BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (SET_ERROR(EFAULT)); + } + + zep->zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (0); +} + /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk @@ -128,6 +225,289 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) mutex_exit(&spa->spa_errlist_lock); } +#ifdef _KERNEL +static int +find_block_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + if (dmu_objset_from_ds(ds, &os) != 0) { + return (SET_ERROR(ENOENT)); + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep->zb_object, FTAG, &dn) != 0) { + return (SET_ERROR(ENOENT)); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int err = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (err != 0 || BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (SET_ERROR(ENOENT)); + } + + *birth_txg = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (0); +} + +static int +check_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + uint64_t *count, void *addr, boolean_t only_count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, fs, FTAG, &ds) != 0) + return (SET_ERROR(EFAULT)); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + if (find_block_txg(ds, zep, &latest_txg) == 0) { + if (zep->zb_birth < latest_txg) { + txg_to_consider = latest_txg; + } else { + /* Block neither free nor re written. */ + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + if (copyout(&zb, (char *)addr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + (*count)--; + } else { + (*count)++; + } + check_snapshot = B_FALSE; + } + } + + uint64_t snap_count; + if (zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count) != 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + if (snap_count == 0) { + /* File system has no snapshot. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) + return (SET_ERROR(EFAULT)); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + break; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + affected = B_FALSE; + uint64_t blk_txg; + if (find_block_txg(ds, zep, &blk_txg) == 0 && + zep->zb_birth == blk_txg) { + affected = B_TRUE; + } + } + + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + if (copyout(&zb, (char *)addr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + goto out; + } + (*count)--; + } else { + (*count)++; + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + if (zap_clone != 0 && aff_snap_count > 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, spa->spa_meta_objset, zap_clone); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *clone; + int err = dsl_dataset_hold_obj(dp, za.za_first_integer, + FTAG, &clone); + + if (err != 0) { + zap_cursor_fini(&zc); + goto out; + } + + boolean_t found = B_FALSE; + for (int i = 0; i < aff_snap_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snap_obj_array[i]) { + found = B_TRUE; + } + } + dsl_dataset_rele(clone, FTAG); + + if (!found) { + continue; + } + + err = check_filesystem(spa, za.za_first_integer, zep, + count, addr, only_count); + + if (err != 0) { + zap_cursor_fini(&zc); + goto out; + } + } + zap_cursor_fini(&zc); + } + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (0); +out: + dsl_dataset_rele(ds, FTAG); + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (SET_ERROR(EFAULT)); +} + +static uint64_t +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds) != 0) + return (SET_ERROR(EFAULT)); + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t top_affected_fs = head_ds; + + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) + break; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + } + dsl_dataset_rele(ds, FTAG); + + return (top_affected_fs); +} +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *addr, boolean_t only_count) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_pool_config_enter(dp, FTAG); + uint64_t top_affected_fs = find_top_affected_fs(spa, head_ds, zep); + int error = check_filesystem(spa, top_affected_fs, zep, count, addr, + only_count); + dsl_pool_config_exit(dp, FTAG); + return (error); +} + +static uint64_t +get_errlog_size(spa_t *spa, uint64_t spa_err_obj) +{ + uint64_t total = 0; + if (spa_err_obj == 0) + return (total); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + if (process_error_block(spa, head_ds, &head_ds_block, + &total, NULL, B_TRUE) != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (total); + } + } + zap_cursor_fini(&head_ds_cursor); + } + zap_cursor_fini(&zc); + return (total); +} + +static uint64_t +get_errlist_size(spa_t *spa, avl_tree_t *tree) +{ + uint64_t total = 0; + + if (avl_numnodes(tree) == 0) + return (total); + + spa_error_entry_t *se; + for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + uint64_t head_ds_obj; + get_head_and_birth_txg(spa, &zep, se->se_bookmark.zb_objset, + &head_ds_obj); + + if (process_error_block(spa, head_ds_obj, &zep, &total, NULL, + B_TRUE) != 0) { + return (SET_ERROR(EFAULT)); + } + } + return (total); +} +#endif + /* * Return the number of errors currently in the error log. This is actually the * sum of both the last log and the current log, since we don't know the union @@ -136,83 +516,152 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) uint64_t spa_get_errlog_size(spa_t *spa) { - uint64_t total = 0, count; - - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); + uint64_t total = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t count; + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + } else { +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + total += get_errlog_size(spa, spa->spa_errlog_last); + total += get_errlog_size(spa, spa->spa_errlog_scrub); + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += get_errlist_size(spa, &spa->spa_errlist_last); + total += get_errlist_size(spa, &spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); +#endif + } return (total); } +/* + * If an error block is shared by two dataset it would be counted twice. For + * detailed message see spa_get_errlog_size() above. + */ + #ifdef _KERNEL static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +process_error_log(spa_t *spa, uint64_t obj, void *addr, uint64_t *count) { zap_cursor_t zc; zap_attribute_t za; - zbookmark_phys_t zb; if (obj == 0) return (0); - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (*count == 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + *count -= 1; - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); } + zap_cursor_fini(&zc); + return (0); + } - name_to_bookmark(za.za_name, &zb); + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds_err_obj = za.za_first_integer; + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + int err; + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + err = process_error_block(spa, head_ds, &head_ds_block, + count, addr, B_FALSE); + + if (err != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } } - - *count -= 1; + zap_cursor_fini(&head_ds_cursor); } - zap_cursor_fini(&zc); - return (0); } static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_list(spa_t *spa, avl_tree_t *list, void *addr, uint64_t *count) { spa_error_entry_t *se; - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { - if (*count == 0) - return (SET_ERROR(ENOMEM)); + if (*count == 0) + return (SET_ERROR(ENOMEM)); - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); + if (copyout(&se->se_bookmark, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); - *count -= 1; + *count -= 1; + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + uint64_t head_ds_obj; + get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj); + + if (process_error_block(spa, head_ds_obj, &zep, + count, addr, B_FALSE) != 0) { + return (SET_ERROR(EFAULT)); + } + + } return (0); } #endif @@ -229,7 +678,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; @@ -244,10 +693,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); @@ -304,28 +753,76 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) char buf[64]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) { + return; + } + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { char *name = se->se_name ? se->se_name : ""; bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; + se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + uint64_t head_dataset_obj; + get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_dataset_obj); + uint64_t err_obj; + int error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, &err_obj); + + if (error != 0) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx) == 0); + } + zap_cursor_fini(&zc); } + VERIFY(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx) == 0); } /* @@ -376,8 +873,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -404,6 +900,134 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_errlog_lock); } +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx) == 0); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +#if defined(_KERNEL) +static uint64_t +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, old_head, FTAG, &ds) != 0) + return (SET_ERROR(EFAULT)); + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (snap_obj != 0) { + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) + continue; + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj + == new_head) { + break; + } + + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + ASSERT(snap_obj != 0); + return (snap_obj_txg); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* Check if file system being depromoted has errlog. */ + if (error != 0) + return; + + uint64_t txg = find_txg_ancestor_snapshot(spa, new_head, old_head); + + /* + * Check if file system being promoted already has errlog otherwise + * create zap object. + */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} +#endif + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); +#if defined(_KERNEL) + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); +#endif + mutex_exit(&spa->spa_errlog_lock); +} + #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); @@ -413,4 +1037,6 @@ EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index ca2da561220b..d8b262bab9d1 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5675,7 +5675,7 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; + uint64_t count = (uint64_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 89cd58c2aa32..fe795cd355cd 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -487,6 +487,7 @@ tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', + 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index accbf69cf9c8..c279f0311948 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -97,5 +97,6 @@ if is_linux || is_freebsd; then "feature@bookmark_v2" "feature@livelist" "feature@zstd_compress" + "feature@head_errlog" ) -fi \ No newline at end of file +fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am index 5553061c67b3..538c5d6c163c 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am @@ -4,4 +4,6 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_status_001_pos.ksh \ zpool_status_002_pos.ksh \ + zpool_status_003_pos.ksh \ + zpool_status_004_pos.ksh \ zpool_status_features_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh new file mode 100755 index 000000000000..da0611d4955a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2021, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify correct output with 'zpool status -v' after corrupting a file +# +# STRATEGY: +# 1. Create a file system +# 2. zinject checksum errors +# 3. Read the file +# 4. Take a snapshot and make a clone +# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' + +function cleanup +{ + log_must zinject -c all + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL +} + +verify_runnable "both" + +log_assert "Verify correct 'zpool status -v' output with a corrupted file" +log_onexit cleanup + +log_must mkfile 10m $TESTDIR/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +log_must zinject -t data -e checksum -f 100 $TESTDIR/10m_file + +# Try to read the 2nd megabyte of 10m_file +dd if=$TESTDIR/10m_file bs=1M skip=1 count=1 || true + +log_must zfs snapshot $TESTPOOL/$TESTFS@snap +log_must zfs clone $TESTPOOL/$TESTFS@snap $TESTPOOL/testclone + +# Look to see that snapshot, clone and filesystem our files report errors +log_must eval "zpool status -v | grep '$TESTPOOL/$TESTFS@snap:/10m_file'" +log_must eval "zpool status -v | grep '$TESTPOOL/testclone/10m_file'" +log_must eval "zpool status -v | grep '$TESTDIR/10m_file'" + +log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh new file mode 100755 index 000000000000..849641f5e148 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh @@ -0,0 +1,81 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, by Delphix. All rights reserved. +# Copyright (c) 2021, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify feature@head_errlog=disabled works. +# +# STRATEGY: +# 1. Create a file system with feature@head_errlog=disabled +# 2. zinject checksum errors +# 3. Read the file +# 4. Take a snapshot and make a clone +# 5. Verify that zpool status displays the old behaviour. + +function cleanup +{ + log_must zinject -c all + datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL +} + +verify_runnable "both" + +log_assert "Verify 'feature@head_errlog=disabled' works" +log_onexit cleanup + +log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL $DISKS +log_must zfs create $TESTPOOL/$TESTFS +if [[ -z $no_mountpoint ]]; then + log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS +fi + +state=$(zpool list -Ho feature@head_errlog $TESTPOOL) +if [[ "$state" != "disabled" ]]; then + log_fail "head_errlog has state $state" +fi + +log_must mkfile 10m $TESTDIR/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +log_must zinject -t data -e checksum -f 100 $TESTDIR/10m_file + +# Try to read the 2nd megabyte of 10m_file +dd if=$TESTDIR/10m_file bs=1M skip=1 count=1 || true + +log_must zfs snapshot $TESTPOOL/$TESTFS@snap +log_must zfs clone $TESTPOOL/$TESTFS@snap $TESTPOOL/testclone + +# Look to see that snapshot, clone and filesystem our files report errors +log_mustnot eval "zpool status -v | grep '$TESTPOOL/$TESTFS@snap:/10m_file'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL/testclone/10m_file'" +log_must eval "zpool status -v | grep '$TESTDIR/10m_file'" + +log_pass "'feature@head_errlog=disabled works"