diff --git a/include/sys/spa.h b/include/sys/spa.h index 49432684346e..f963f06358ac 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1175,11 +1175,14 @@ extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); +extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx); +extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, + uint64_t old_head_ds, dmu_tx_t *tx); /* vdev cache */ extern void vdev_cache_stat_init(void); diff --git a/include/sys/zio.h b/include/sys/zio.h index bf441ff8d511..4b4fdc70f536 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -305,6 +305,13 @@ struct zbookmark_phys { uint64_t zb_blkid; }; +typedef struct zbookmark_err_phys { + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; + uint64_t zb_birth; +} zbookmark_err_phys_t; + #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 4f78229736e3..7977854039d9 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -72,6 +72,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, + SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURES } spa_feature_t; diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 6c03f9049066..269653ba9240 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -925,5 +925,27 @@ is used to checkpoint the pool. The feature will only return back to being \fBenabled\fR when the pool is rewound or the checkpoint has been discarded. + +.sp +.ne 2 +.na +\fBhead_errlog\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:head_errlog +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables the upgraded version of errlog. This required on disk +error format change. Now, errorlog of each head dataset is stored +seprately in zap object and primarily keyed by this head Id. +By this feature every dataset affected by an error block is listed under +\fBzpool status\fR command. + +The feature can only be set at the time of pool creation. + .SH "SEE ALSO" zpool(8) diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 8e1aef5daa77..3c931ce71c99 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -553,6 +553,13 @@ zpool_feature_init(void) "com.datto:resilver_defer", "resilver_defer", "Support for defering new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + zfeature_register(SPA_FEATURE_HEAD_ERRLOG, + "com.delphix:head_errlog", "head_errlog", + "Improved zpool status command output.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); + } } #if defined(_KERNEL) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 069a66ecbc3b..2e94a91135e5 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3726,6 +3726,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 5c483c5dd961..52e5aeb6230e 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1132,6 +1132,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index e42f8a0212f6..afcdaf3deabb 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -53,7 +53,17 @@ #include #include #include +#include +#include +#include +#ifdef _KERNEL +static int check_filesystem(spa_t *spa, uint64_t head_ds, + zbookmark_err_phys_t *zep, uint64_t *count, void *addr, + boolean_t only_count); +static uint64_t get_errlog_size(spa_t *spa, uint64_t spa_err_obj); +static uint64_t get_errlist_size(spa_t *spa, avl_tree_t *tree); +#endif /* * Convert a bookmark to a string. @@ -67,9 +77,20 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert a err_phys to a string. */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + #ifdef _KERNEL +/* + * Convert a string to a bookmark. + */ static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,8 +103,91 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } + +/* + * Convert a string to a err_phys. + */ +static void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +zeb_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} #endif +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static int +get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, + uint64_t *head_dataset_id) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + dsl_pool_config_enter(dp, FTAG); + int err = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + return (EFAULT); + } + *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (EFAULT); + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep->zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (EFAULT); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (err != 0 || BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (EFAULT); + } + + zep->zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (0); +} + /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk @@ -136,83 +240,440 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) uint64_t spa_get_errlog_size(spa_t *spa) { - uint64_t total = 0, count; - - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); + uint64_t total = 0; + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t count; + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + } else { +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + total += get_errlog_size(spa, spa->spa_errlog_scrub); + total += get_errlog_size(spa, spa->spa_errlog_last); + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += get_errlist_size(spa, &spa->spa_errlist_last); + total += get_errlist_size(spa, &spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); +#endif + } return (total); } #ifdef _KERNEL static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +find_block_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + if (dmu_objset_from_ds(ds, &os) != 0) { + return (ENOENT); + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep->zb_object, FTAG, &dn) != 0) { + return (ENOENT); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int err = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (err != 0 || BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (ENOENT); + } + + *birth_txg = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (0); +} + +static int +check_clones(spa_t *spa, uint64_t *snapshot, uint64_t snapshot_count, + uint64_t zap_clone, zbookmark_err_phys_t *zep, uint64_t *count, + void *addr, boolean_t only_count) { zap_cursor_t zc; zap_attribute_t za; - zbookmark_phys_t zb; - if (obj == 0) - return (0); - - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + for (zap_cursor_init(&zc, spa->spa_meta_objset, zap_clone); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { - if (*count == 0) { + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *clone; + int err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, + &clone); + + if (err != 0) { zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); + return (EFAULT); } - name_to_bookmark(za.za_name, &zb); + boolean_t found = B_FALSE; + for (int i = 0; i < snapshot_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snapshot[i]) { + found = B_TRUE; + } + } + dsl_dataset_rele(clone, FTAG); - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); + if (!found) { + continue; + } + + check_filesystem(spa, za.za_first_integer, zep, count, addr, + only_count); + } + zap_cursor_fini(&zc); + return (0); +} + +static int +check_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + uint64_t *count, void *addr, boolean_t only_count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, fs, FTAG, &ds) != 0) + return (EFAULT); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + if (find_block_txg(ds, zep, &latest_txg) == 0) { + if (zep->zb_birth < latest_txg) { + txg_to_consider = latest_txg; + } else { + /* Block neither free nor re written. */ + if (!only_count) { + zbookmark_phys_t zb; + zeb_to_zb(fs, zep, &zb); + if (copyout(&zb, (char *)addr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + (*count)--; + } else { + (*count)++; + } + check_snapshot = B_FALSE; + } + } + + uint64_t snap_count; + if (zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count) != 0) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + + if (snap_count == 0) { + /* File system has no snapshot. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) return (SET_ERROR(EFAULT)); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + break; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + affected = B_FALSE; + uint64_t blk_txg; + if (find_block_txg(ds, zep, &blk_txg) == 0 && + zep->zb_birth == blk_txg) { + affected = B_TRUE; + } } - *count -= 1; + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + if (!only_count) { + zbookmark_phys_t zb; + zeb_to_zb(snap_obj, zep, &zb); + if (copyout(&zb, (char *)addr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + goto out; + } + (*count)--; + } else { + (*count)++; + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + if (zap_clone != 0 && aff_snap_count > 0) { + check_clones(spa, snap_obj_array, aff_snap_count, zap_clone, + zep, count, addr, only_count); + } + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (0); +out: + dsl_dataset_rele(ds, FTAG); + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (SET_ERROR(EFAULT)); +} + +static uint64_t +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds) != 0) + return (EFAULT); + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t top_affected_fs = head_ds; + + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) + break; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; } + dsl_dataset_rele(ds, FTAG); + return (top_affected_fs); +} + +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *addr, boolean_t only_count) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_pool_config_enter(dp, FTAG); + uint64_t top_affected_fs = find_top_affected_fs(spa, head_ds, zep); + int error = check_filesystem(spa, top_affected_fs, zep, count, addr, + only_count); + dsl_pool_config_exit(dp, FTAG); + return (error); +} + +static uint64_t +get_errlog_size(spa_t *spa, uint64_t spa_err_obj) +{ + uint64_t total = 0; + if (spa_err_obj == 0) + return (total); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + if (process_error_block(spa, head_ds, &head_ds_block, + &total, NULL, B_TRUE) != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (total); + } + } + zap_cursor_fini(&head_ds_cursor); + } zap_cursor_fini(&zc); + return (total); +} +static uint64_t +get_errlist_size(spa_t *spa, avl_tree_t *tree) +{ + uint64_t total = 0; + + if (avl_numnodes(tree) == 0) + return (total); + + spa_error_entry_t *se; + for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + uint64_t head_ds_obj; + get_head_and_birth_txg(spa, &zep, se->se_bookmark.zb_objset, + &head_ds_obj); + + if (process_error_block(spa, head_ds_obj, &zep, &total, NULL, + B_TRUE) != 0) { + return (SET_ERROR(EFAULT)); + } + } + return (total); +} +#endif + +/* + * If an error block is shared by two dataset it would be counted twice. For + * detailed message see spa_get_errlog_size() above. + */ + +#ifdef _KERNEL +static int +process_error_log(spa_t *spa, uint64_t obj, void *addr, uint64_t *count) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (obj == 0) + return (0); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (*count == 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + *count -= 1; + + } + zap_cursor_fini(&zc); + return (0); + } + + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds_err_obj = za.za_first_integer; + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + int err; + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + err = process_error_block(spa, head_ds, &head_ds_block, + count, addr, B_FALSE); + + if (err != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + } + zap_cursor_fini(&head_ds_cursor); + } + zap_cursor_fini(&zc); return (0); } static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_list(spa_t *spa, avl_tree_t *list, void *addr, uint64_t *count) { spa_error_entry_t *se; - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { - if (*count == 0) - return (SET_ERROR(ENOMEM)); + if (*count == 0) + return (SET_ERROR(ENOMEM)); - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); + if (copyout(&se->se_bookmark, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); - *count -= 1; + *count -= 1; + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + uint64_t head_ds_obj; + get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj); + + if (process_error_block(spa, head_ds_obj, &zep, + count, addr, B_FALSE) != 0) { + return (SET_ERROR(EFAULT)); + } + + } return (0); } #endif @@ -229,7 +690,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; @@ -244,10 +705,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); @@ -304,28 +765,76 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) char buf[64]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) { + return; + } + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { char *name = se->se_name ? se->se_name : ""; bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; + se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + uint64_t head_dataset_obj; + get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_dataset_obj); + uint64_t err_obj; + int error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, &err_obj); + + if (error != 0) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx) == 0); + } + zap_cursor_fini(&zc); } + VERIFY(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx) == 0); } /* @@ -376,8 +885,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -404,6 +912,134 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_errlog_lock); } +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx) == 0); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +#if defined(_KERNEL) +static uint64_t +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + if (dsl_dataset_hold_obj(dp, old_head, FTAG, &ds) != 0) + return (EFAULT); + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (snap_obj != 0) { + dsl_dataset_rele(ds, FTAG); + if (dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds) != 0) + continue; + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj + == new_head) { + break; + } + + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + ASSERT(snap_obj != 0); + return (snap_obj_txg); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* Check if file system being depromoted has errlog. */ + if (error != 0) + return; + + uint64_t txg = find_txg_ancestor_snapshot(spa, new_head, old_head); + + /* + * Check if file system being promoted already has errlog otherwise + * create zap object. + */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} +#endif + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); +#if defined(_KERNEL) + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); +#endif + mutex_exit(&spa->spa_errlog_lock); +} + #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); @@ -413,4 +1049,6 @@ EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 9f4991162ff4..74eebb35c174 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5432,7 +5432,7 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; + uint64_t count = (uint64_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index f8f04d73ef96..f8b0e93ba6ef 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -462,7 +462,8 @@ tests = ['zpool_split_cliargs', 'zpool_split_devices', tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] -tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] +tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', + 'zpool_status_004_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am index beb59e3d066b..5726afce2976 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am @@ -3,4 +3,6 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zpool_status_001_pos.ksh \ - zpool_status_002_pos.ksh + zpool_status_002_pos.ksh \ + zpool_status_003_pos.ksh \ + zpool_status_004_pos.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh new file mode 100755 index 000000000000..33220e73a3df --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -0,0 +1,69 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify correct output with 'zpool status -v' after corrupting a file +# +# STRATEGY: +# 1. Create a file system +# 2. zinject checksum errors +# 3. Read the file +# 4. Take a snapshot and make a clone +# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' + +function cleanup +{ + log_must zfs destroy -R $TESTPOOL/$TESTFS@snap + log_must zinject -c all +} + +verify_runnable "both" + +log_assert "Verify correct 'zpool status -v' output with a corrupted file" +log_onexit cleanup + +log_must mkfile 10m $TESTDIR/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +log_must zinject -t data -e checksum -f 100 $TESTDIR/10m_file + +# Try to read the 2nd megabyte of 10m_file +dd if=$TESTDIR/10m_file bs=1M skip=1 count=1 || true + +log_must zfs snapshot $TESTPOOL/$TESTFS@snap +log_must zfs clone $TESTPOOL/$TESTFS@snap $TESTPOOL/testclone + +# Look to see that snapshot, clone and filesystem our files report errors +log_must eval "zpool status -v | grep '$TESTPOOL/$TESTFS@snap:/10m_file'" +log_must eval "zpool status -v | grep '$TESTPOOL/testclone/10m_file'" +log_must eval "zpool status -v | grep '$TESTDIR/10m_file'" + +log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh new file mode 100755 index 000000000000..ec03f39c5aa8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_004_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify feature@headerrlog=disabled works perfectly. +# +# STRATEGY: +# 1. Create a file system +# 2. zinject checksum errors +# 3. Read the file +# 4. Take a snapshot and make a clone +# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' + +function cleanup +{ + log_must zfs destroy -R $TESTPOOL/$TESTFS@snap + log_must zinject -c all +} + +verify_runnable "both" + +log_assert "Verify correct 'zpool status -v' output with a corrupted file" +log_onexit cleanup + +DISK=${DISKS%% *} + +log_must zpool create -f -o feature@headerrlog=disabled $TESTPOOL $DISK + +log_must mkfile 10m $TESTDIR/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +log_must zinject -t data -e checksum -f 100 $TESTDIR/10m_file + +# Try to read the 2nd megabyte of 10m_file +dd if=$TESTDIR/10m_file bs=1M skip=1 count=1 || true + +log_must zfs snapshot $TESTPOOL/$TESTFS@snap +log_must zfs clone $TESTPOOL/$TESTFS@snap $TESTPOOL/testclone + +# Look to see that snapshot, clone and filesystem our files report errors +log_mustnot eval "zpool status -v | grep '$TESTPOOL/$TESTFS@snap:/10m_file'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL/testclone/10m_file'" +log_must eval "zpool status -v | grep '$TESTDIR/10m_file'" + +log_pass "'feature@headerrlog=disabled works perfectly"