From d91e2ca0162e1235551df98ef86a61245ddc17d1 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Fri, 17 Dec 2021 21:35:28 +0100 Subject: [PATCH] Teach zpool scrub to scrub only blocks in error log Added a -e flag to zpool scrub to scrub only the blocks in the error log. A user can pause or cancel an error scrub by additionally passing -p or -s, and resume a paused one by issuing zpool scrub -e again, just like a regular scrub. This involves adding a new flag, creating new libzfs interfaces, a new ioctl, and the actual iteration and read-issuing logic. Error scrubbing is spread across multiple txgs to make sure pool performance is not affected. Co-authored-by: TulsiJain <tulsi.jain@delphix.com> Signed-off-by: George Amanakis --- cmd/zpool/zpool_main.c | 111 ++- include/libzfs.h | 4 + include/libzfs_core.h | 2 + include/sys/dmu.h | 1 + include/sys/dsl_scan.h | 27 +- include/sys/fs/zfs.h | 19 +- include/sys/spa.h | 8 + include/sys/spa_impl.h | 4 + include/sys/sysevent/eventdefs.h | 5 + lib/libzfs/libzfs.abi | 3 +- lib/libzfs/libzfs_pool.c | 92 ++- lib/libzfs/libzfs_util.c | 17 +- lib/libzfs_core/libzfs_core.abi | 105 +++ lib/libzfs_core/libzfs_core.c | 7 + man/man8/zpool-scrub.8 | 18 + module/zfs/dsl_scan.c | 699 +++++++++++++++++- module/zfs/spa.c | 2 + module/zfs/spa_errlog.c | 81 +- module/zfs/spa_misc.c | 25 +- module/zfs/zfs_ioctl.c | 44 ++ tests/runfiles/common.run | 4 +- tests/zfs-tests/cmd/libzfs_input_check.c | 15 + tests/zfs-tests/include/libtest.shlib | 18 + tests/zfs-tests/tests/Makefile.am | 3 + .../zpool_scrub/zpool_error_scrub_001_pos.ksh | 79 ++ .../zpool_scrub/zpool_error_scrub_002_pos.ksh | 99 +++ .../zpool_scrub/zpool_error_scrub_003_pos.ksh | 109 +++ 27 files changed, 1533 insertions(+), 68 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 301c5f4bfc6f..703d84044003 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -401,7 +401,7 @@ get_usage(zpool_help_t idx) return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: - return (gettext("\tscrub [-s | -p] [-w] ...\n")); + return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: @@ -7297,8 +7297,9 @@ wait_callback(zpool_handle_t *zhp, void *data) } /* - * zpool scrub [-s | -p] [-w] ... + * zpool scrub [-s | -p] [-w] [-e] ... * + * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. @@ -7314,14 +7315,21 @@ zpool_do_scrub(int argc, char **argv) cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + boolean_t is_error_scrub = B_FALSE; + boolean_t is_pause = B_FALSE; + boolean_t is_stop = B_FALSE; + /* check options */ - while ((c = getopt(argc, argv, "spw")) != -1) { + while ((c = getopt(argc, argv, "spwe")) != -1) { switch (c) { + case 'e': + is_error_scrub = B_TRUE; + break; case 's': - cb.cb_type = POOL_SCAN_NONE; + is_stop = B_TRUE; break; case 'p': - cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; + is_pause = B_TRUE; break; case 'w': wait = B_TRUE; @@ -7333,11 +7341,21 @@ zpool_do_scrub(int argc, char **argv) } } - if (cb.cb_type == POOL_SCAN_NONE && - cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { - (void) fprintf(stderr, gettext("invalid option combination: " - "-s and -p are mutually exclusive\n")); + if (is_pause && is_stop) { + (void) fprintf(stderr, gettext("invalid option " + "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); + } else { + if (is_error_scrub) + cb.cb_type = POOL_SCAN_ERRORSCRUB; + + if (is_pause) { + cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; + } else if 
(is_stop) { + cb.cb_type = POOL_SCAN_NONE; + } else { + cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + } } if (wait && (cb.cb_type == POOL_SCAN_NONE || @@ -7561,6 +7579,70 @@ secs_to_dhms(uint64_t total, char *buf) } } +/* + * Print out detailed error scrub status. + */ +static void +print_err_scrub_status(pool_scan_stat_t *ps) +{ + time_t start, end, pause; + uint64_t total_secs_left; + uint64_t secs_left, mins_left, hours_left, days_left; + uint64_t examined, to_be_examined; + + if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) { + return; + } + + (void) printf(gettext(" scrub: ")); + + start = ps->pss_error_scrub_start; + end = ps->pss_error_scrub_end; + pause = ps->pss_pass_error_scrub_pause; + examined = ps->pss_error_scrub_examined; + to_be_examined = ps->pss_error_scrub_to_be_examined; + + assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); + + if (ps->pss_error_scrub_state == DSS_FINISHED) { + total_secs_left = end - start; + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); + + (void) printf(gettext("scrubbed %llu error blocks in %llu days " + "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + ctime(&end)); + + return; + } else if (ps->pss_error_scrub_state == DSS_CANCELED) { + (void) printf(gettext("error scrub canceled on %s"), + ctime(&end)); + return; + } + assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING); + + /* Error scrub is in progress. 
*/ + if (pause == 0) { + (void) printf(gettext("error scrub in progress since %s"), + ctime(&start)); + } else { + (void) printf(gettext("error scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\terror scrub started on %s"), + ctime(&start)); + } + + double fraction_done = (double)examined / (to_be_examined + examined); + (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error" + " blocks"), 100 * fraction_done, (u_longlong_t)examined); + + (void) printf("\n"); +} + /* * Print out detailed scrub status. */ @@ -7897,10 +7979,12 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t have_errorscrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; + time_t scrub_start = 0, errorscrub_start = 0; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { @@ -7909,16 +7993,23 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) active_resilver = (ps->pss_state == DSS_SCANNING); } + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + scrub_start = ps->pss_start_time; + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ - if (have_scrub) + if (have_scrub && scrub_start > errorscrub_start) print_scan_scrub_resilver_status(ps); + else if (have_errorscrub && errorscrub_start >= scrub_start) + print_err_scrub_status(ps); /* * When there is an active resilver or rebuild print its status. 
diff --git a/include/libzfs.h b/include/libzfs.h index 87d1ed738f2b..bda90e51f4c1 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -125,11 +125,15 @@ typedef enum zfs_error { EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ + EZFS_ERRORSCRUBBING, /* currently error scrubbing */ + EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */ EZFS_NO_SCRUB, /* no active scrub */ + EZFS_NO_ERRORSCRUB, /* no active error scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ + EZFS_SCRUB_PAUSED_TO_CANCEL, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 14a4857c35da..867c18b9c226 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -155,6 +155,8 @@ _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1b82ff620f27..61a4a9fbad9a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -378,6 +378,7 @@ typedef struct dmu_buf { #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" +#define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" diff --git a/include/sys/dsl_scan.h 
b/include/sys/dsl_scan.h index 8925b5815a37..6753b4a8f359 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -78,6 +79,21 @@ typedef enum dsl_scan_flags { #define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) +typedef struct dsl_errorscrub_phys { + uint64_t dep_func; /* pool_scan_func_t */ + uint64_t dep_state; /* dsl_scan_state_t */ + uint64_t dep_cursor; /* serialized zap cursor for tracing progress */ + uint64_t dep_start_time; /* error scrub start time, unix timestamp */ + uint64_t dep_end_time; /* error scrub end time, unix timestamp */ + uint64_t dep_to_examine; /* total error blocks to be scrubbed */ + uint64_t dep_examined; /* blocks scrubbed so far */ + uint64_t dep_errors; /* error scrub I/O error count */ + uint64_t dep_paused_flags; /* flag for paused */ +} dsl_errorscrub_phys_t; + +#define ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \ + / sizeof (uint64_t)) + /* * Every pool will have one dsl_scan_t and this structure will contain * in-memory information about the scan and a pointer to the on-disk @@ -151,11 +167,15 @@ typedef struct dsl_scan { uint64_t scn_avg_zio_size_this_txg; uint64_t scn_zios_this_txg; + /* zap cursor for tracing error scrub progress */ + zap_cursor_t errorscrub_cursor; /* members needed for syncing scan status to disk */ dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ uint64_t scn_queues_pending; /* outstanding data to issue */ + /* members needed for syncing error scrub status to disk */ + dsl_errorscrub_phys_t errorscrub_phys; } dsl_scan_t; typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; @@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); -int 
dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); +boolean_t dsl_errorscrubbing(const struct dsl_pool *dp); +boolean_t dsl_errorscrub_active(dsl_scan_t *scn); void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, + pool_scrub_cmd_t cmd); +void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); @@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, struct dmu_tx *tx); boolean_t dsl_scan_active(dsl_scan_t *scn); boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn); void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 0734ff12280e..563301666428 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1036,6 +1036,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; @@ -1099,6 +1100,20 @@ typedef struct pool_scan_stat { uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ + + /* error scrub values stored on disk */ + uint64_t pss_error_scrub_func; /* pool_scan_func_t */ + uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ + uint64_t pss_error_scrub_start; /* error scrub start time */ + uint64_t pss_error_scrub_end; /* error scrub end time */ + uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ + /* error blocks to be issued I/O */ + uint64_t pss_error_scrub_to_be_examined; + + /* error scrub values 
not stored on disk */ + /* error scrub pause timestamp in seconds; 0 when not paused */ + uint64_t pss_pass_error_scrub_pause; + } pool_scan_stat_t; typedef struct pool_removal_stat { @@ -1120,6 +1135,7 @@ typedef enum dsl_scan_state { DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, + DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; @@ -1359,7 +1375,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 81/128 numbers reserved. + * Core features - 88/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1454,6 +1470,7 @@ typedef enum zfs_ioc { ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ + ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ /* * Per-platform (Optional) - 8/128 numbers reserved. diff --git a/include/sys/spa.h b/include/sys/spa.h index b96a9ef1d42f..14e196f33bda 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1154,6 +1154,7 @@ extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); +extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); @@ -1164,6 +1165,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); +extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, + zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); +extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg); +extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, + zbookmark_phys_t *zb); +extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev cache */ 
extern void vdev_cache_stat_init(void); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5782c54bd78f..44afa763283a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -295,6 +295,10 @@ struct spa { uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + /* error scrub pause timestamp in seconds; 0 when not paused */ + uint64_t spa_scan_pass_errorscrub_pause; + /* total time the error scrub has spent paused, in seconds */ + uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index eb1dfd16c0fd..b159fa3c301f 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -123,6 +123,11 @@ extern "C" { #define ESC_ZFS_TRIM_CANCEL "trim_cancel" #define ESC_ZFS_TRIM_RESUME "trim_resume" #define ESC_ZFS_TRIM_SUSPEND "trim_suspend" +#define ESC_ZFS_ERRORSCRUB_START "scrub_start" +#define ESC_ZFS_ERRORSCRUB_FINISH "scrub_finish" +#define ESC_ZFS_ERRORSCRUB_ABORT "scrub_abort" +#define ESC_ZFS_ERRORSCRUB_RESUME "scrub_resume" +#define ESC_ZFS_ERRORSCRUB_PAUSED "scrub_paused" /* * datalink subclass definitions. 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 732863dcffc7..a37951b32b42 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5717,7 +5717,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 4fb71b4e0dc8..8118bee1988a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2648,24 +2648,38 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) int err; libzfs_handle_t *hdl = zhp->zpool_hdl; - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = func; - zc.zc_flags = cmd; - - if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) - return (0); + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, "scan_type", (uint64_t)func); + fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); - err = errno; + err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); + fnvlist_free(args); - /* ECANCELED on a scrub means we resumed a paused scrub */ - if (err == ECANCELED && func == POOL_SCAN_SCRUB && - cmd == POOL_SCRUB_NORMAL) + if (err == 0) return (0); - if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) + /* + * An ECANCELED on a scrub means one of the following: + * 1. we resumed a paused scrub. + * 2. we resumed a paused error scrub. + * 3. Error scrub is not run because of no error log. + */ + if (err == ECANCELED && (func == POOL_SCAN_SCRUB || + func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL) return (0); + /* + * The following cases have been handled here: + * 1. Paused a scrub/error scrub if there is none in progress. 
+ */ + if (err == ENOENT && func != POOL_SCAN_NONE && cmd == + POOL_SCRUB_PAUSE) { + return (0); + } - if (func == POOL_SCAN_SCRUB) { + ASSERT3U(func, >=, POOL_SCAN_NONE); + ASSERT3U(func, <, POOL_SCAN_FUNCS); + + if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), @@ -2687,6 +2701,17 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) assert(!"unexpected result"); } + /* + * With EBUSY, five cases are possible: + * + * Current state Requested + * 1. Normal Scrub Running Normal Scrub or Error Scrub + * 2. Normal Scrub Paused Error Scrub + * 3. Normal Scrub Paused Pause Normal Scrub + * 4. Error Scrub Running Normal Scrub or Error Scrub + * 5. Error Scrub Paused Pause Error Scrub + * 6. Resilvering Anything else + */ if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; @@ -2698,16 +2723,49 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB && ps->pss_state == DSS_SCANNING) { - if (cmd == POOL_SCRUB_PAUSE) - return (zfs_error(hdl, EZFS_SCRUB_PAUSED, + if (ps->pss_pass_scrub_pause == 0) { + /* handles case 1 */ + assert(cmd == POOL_SCRUB_NORMAL); + return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); - else - return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); + } else { + if (func == POOL_SCAN_ERRORSCRUB) { + /* handles case 2 */ + ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); + return (zfs_error(hdl, + EZFS_SCRUB_PAUSED_TO_CANCEL, + errbuf)); + } else { + /* handles case 3 */ + ASSERT3U(func, ==, POOL_SCAN_SCRUB); + ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); + return (zfs_error(hdl, + EZFS_SCRUB_PAUSED, errbuf)); + } + } + } else if (ps && + ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB && + ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) { + if (ps->pss_pass_error_scrub_pause == 0) { + /* 
handles case 4 */ + ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); + return (zfs_error(hdl, EZFS_ERRORSCRUBBING, + errbuf)); + } else { + /* handles case 5 */ + ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB); + ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); + return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED, + errbuf)); + } } else { + /* handles case 6 */ return (zfs_error(hdl, EZFS_RESILVERING, errbuf)); } - } else if (err == ENOENT) { + } else if (err == ENOENT && func == POOL_SCAN_NONE) { return (zfs_error(hdl, EZFS_NO_SCRUB, errbuf)); + } else if (err == ENOENT && func == POOL_SCAN_ERRORSCRUB) { + return (zfs_error(hdl, EZFS_NO_ERRORSCRUB, errbuf)); } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) { return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, errbuf)); } else { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 4b8a20160e02..55a540410653 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -243,12 +243,25 @@ libzfs_error_description(libzfs_handle_t *hdl) "into a new one")); case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " - "use 'zpool scrub' to resume")); + "use 'zpool scrub' to resume scrub")); + case EZFS_SCRUB_PAUSED_TO_CANCEL: + return (dgettext(TEXT_DOMAIN, "scrub is paused; " + "use 'zpool scrub' to resume or 'zpool scrub -s' to " + "cancel scrub")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " - "use 'zpool scrub -s' to cancel current scrub")); + "use 'zpool scrub -s' to cancel scrub")); + case EZFS_ERRORSCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently error scrubbing; " + "use 'zpool scrub -s' to cancel error scrub")); + case EZFS_ERRORSCRUB_PAUSED: + return (dgettext(TEXT_DOMAIN, "error scrub is paused; " + "use 'zpool scrub -e' to resume error scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); + case EZFS_NO_ERRORSCRUB: + return (dgettext(TEXT_DOMAIN, "there is no active error " + "scrub")); case EZFS_DIFF: return 
(dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index ec94a4650553..ad992bddb38b 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -187,6 +187,7 @@ + @@ -1260,6 +1261,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 254f14e04321..c63a16de5ab6 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -247,6 +247,13 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, return (error); } +int +lzc_scrub(zfs_ioc_t ioc, const char *name, + nvlist_t *source, nvlist_t **resultp) +{ + return (lzc_ioctl(ioc, name, source, resultp)); +} + int lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 1fdbb8a5d56d..2b0de54308bf 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -38,6 +38,7 @@ .Cm scrub .Op Fl s Ns | Ns Fl p .Op Fl w +.Op Fl e .Ar pool Ns … . .Sh DESCRIPTION @@ -92,9 +93,26 @@ Once resumed the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub issue .Nm zpool Cm scrub +or +.Nm zpool Cm scrub +.Fl e again. .It Fl w Wait until scrub has completed before returning. +.It Fl e +Only scrub blocks in the error log. +It is strongly advised to run error scrubs with keys loaded and filesystems +mounted in the case of encrypted filesystems. +Otherwise, errors in encrypted filesystems that are unmounted or have unloaded +keys will be reported as belonging to the root filesystem. +Error scrubs can also be paused and canceled with +.Fl p +and +.Fl s +respectively. 
+Error scrubbing is I/O-intensive and cannot run simultaneously with regular +scrubbing or resilvering. +It also cannot run if a regular scrub is in the paused state. .El .Sh EXAMPLES .Ss Example 1 diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d6a9365df120..dc0ae52b215a 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -54,6 +54,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -129,6 +130,7 @@ static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_data_disks(spa_t *spa); +static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; @@ -231,6 +233,10 @@ static int zfs_resilver_disable_defer = B_FALSE; */ static int zfs_free_bpobj_enabled = 1; +/* Error blocks to be scrubbed in one txg. */ +unsigned long zfs_scrub_error_blocks_in_one_txg = 1 << 12; +int zfs_error_scrub_min_time_ms = 1000; /* min millisecs to error scrub txg */ + /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, @@ -511,9 +517,17 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), + ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); + + if (err != 0 && err != ENOENT) + return (err); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + /* * Detect if the pool contains the signature of #2094. 
If it * does properly update the scn->scn_phys structure and notify @@ -663,6 +677,22 @@ dsl_scan_scrubbing(const dsl_pool_t *dp) scn_phys->scn_func == POOL_SCAN_SCRUB); } +boolean_t +dsl_errorscrubbing(const dsl_pool_t *dp) +{ + dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; + + return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && + errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); +} + +boolean_t +dsl_errorscrub_is_paused(const dsl_scan_t *scn) +{ + return (dsl_errorscrubbing(scn->scn_dp) && + scn->errorscrub_phys.dep_paused_flags); +} + boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { @@ -670,6 +700,68 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn) scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } +static void +dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + scn->errorscrub_phys.dep_cursor = + zap_cursor_serialize(&scn->errorscrub_cursor); + + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, + &scn->errorscrub_phys, tx)); +} + +static void +dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + scn->errorscrub_phys.dep_func = *funcp; + scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; + scn->errorscrub_phys.dep_start_time = gethrestime_sec(); + scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); + scn->errorscrub_phys.dep_examined = 0; + scn->errorscrub_phys.dep_errors = 0; + scn->errorscrub_phys.dep_cursor = 0; + zap_cursor_init_serialized(&scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + 
scn->errorscrub_phys.dep_cursor); + + vdev_config_dirty(spa->spa_root_vdev); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); + + dsl_errorscrub_sync_state(scn, tx); + + spa_history_log_internal(spa, "error scrub setup", tx, + "func=%u mintxg=%u maxtxg=%llu", + *funcp, 0, (u_longlong_t)tx->tx_txg); +} + +static int +dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { + return (SET_ERROR(EBUSY)); + } + + if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { + return (ECANCELED); + } + return (0); +} + /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always @@ -745,7 +837,8 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || + dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); @@ -754,6 +847,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; @@ -763,6 +857,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); + + /* + * If we are starting a fresh scrub, we erase the error scrub + * information from disk. 
+ */ + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + dsl_errorscrub_sync_state(scn, tx); + scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; @@ -856,8 +958,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, + * error scrub or resilver. Can also be called to resume a paused scrub or + * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) @@ -883,6 +986,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB) { + if (dsl_errorscrub_is_paused(dp->dp_scan)) { + /* + * got error scrub start cmd, resume paused error scrub. + */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, + &func, 0, ZFS_SPACE_CHECK_RESERVED)); + } + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, @@ -891,7 +1014,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } - return (SET_ERROR(err)); } @@ -899,6 +1021,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +static void +dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + if (complete) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); + spa_history_log_internal(spa, "error scrub 
done", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } else { + spa_history_log_internal(spa, "error scrub canceled", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } + + scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; + spa->spa_scrub_active = B_FALSE; + spa_errlog_rotate(spa); + scn->errorscrub_phys.dep_end_time = gethrestime_sec(); + zap_cursor_fini(&scn->errorscrub_cursor); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); +} + static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { @@ -1045,6 +1194,92 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); } +static int +dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* + * can't pause a error scrub when there is no in-progress + * error scrub. 
+ */ + if (!dsl_errorscrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause an error scrub that is already paused */ + if (dsl_errorscrub_is_paused(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); + scn->errorscrub_phys.dep_paused_flags = B_TRUE; + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_errorscrub_is_paused(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the error scrub + * rate shown in the output of 'zpool status'. + */ + spa->spa_scan_pass_errorscrub_spent_paused += + gethrestime_sec() - + spa->spa_scan_pass_errorscrub_pause; + + spa->spa_scan_pass_errorscrub_pause = 0; + scn->errorscrub_phys.dep_paused_flags = B_FALSE; + + zap_cursor_init_serialized( + &scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + dsl_errorscrub_sync_state(scn, tx); + } + } +} + +static int +dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + /* can't cancel an error scrub when none is in progress */ + if (!dsl_errorscrubbing(scn->scn_dp)) + return (SET_ERROR(ENOENT)); + return (0); +} + +static void +dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_errorscrub_done(scn, B_FALSE, tx); + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_ABORT); +} + +static int dsl_scan_cancel_check(void
*arg, dmu_tx_t *tx) { @@ -1070,6 +1305,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, + NULL, 3, ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } @@ -1136,6 +1376,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_pause_resume_check, + dsl_errorscrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); @@ -1422,6 +1668,42 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) return (B_FALSE); } +static boolean_t +dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* + * We suspend if: + * - we have scrubbed for at least the minimum time (default 1 sec + * for error scrub), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. 
+ */ + uint64_t curr_time_ns = gethrtime(); + uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int mintime = zfs_error_scrub_min_time_ms; + + if ((NSEC2MSEC(error_scrub_time_ns) > mintime && + (txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("error scrub suspending at bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + } + return (B_TRUE); + } + return (B_FALSE); +} + typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; @@ -3351,6 +3633,19 @@ dsl_scan_active(dsl_scan_t *scn) return ((used != 0) || (clones_left)); } +boolean_t +dsl_errorscrub_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (dsl_errorscrubbing(scn->scn_dp)) + return (B_TRUE); + return (B_FALSE); +} + static boolean_t dsl_scan_check_deferred(vdev_t *vd) { @@ -3567,6 +3862,386 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static void +read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) + return; + + if 
(dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + /* + * If the key is not loaded dbuf_dnode_findbp() will error out with + * EACCES. However in that case dnode_hold() will eventually call + * dbuf_read()->zio_wait() which may call spa_log_error(). This will + * lead to a deadlock due to us holding the mutex spa_errlist_lock. + * Avoid this by checking here if the keys are loaded, if not return. + * If the keys are not loaded the head_errlog feature is meaningless + * as we cannot figure out the birth txg of the block pointer. + */ + if (dsl_dataset_get_keystatus(ds->ds_dir) == + ZFS_KEYSTATUS_UNAVAILABLE) { + dsl_dataset_rele(ds, FTAG); + return; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, + NULL); + + if (error) { + rw_exit(&dn->dn_struct_rwlock); + dsl_dataset_rele(ds, FTAG); + return; + } + + if (!error && BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; + + /* If it's an intent log block, failure is expected. */ + if (zb.zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + ASSERT(!BP_IS_EMBEDDED(&bp)); + scan_exec_io(dp, &bp, zio_flags, &zb, NULL); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * We keep track of the scrubbed error blocks in "count". This will be used + * when deciding whether we exceeded zfs_scrub_error_blocks_in_one_txg. This + * function is modelled after check_filesystem(). 
+ */ +static int +scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + int *count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out, then err on the side of caution and + * proceed. In the worst case, scrub all objects. If zep->zb_birth + * is 0 (e.g. in case of encryption with unloaded keys) also proceed to + * scrub all objects. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* The block was neither freed nor rewritten. */ + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_in_one_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + check_snapshot = B_FALSE; + } else { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots.
*/ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + dsl_dataset_rele(ds, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + + /* + * Scrub the snapshot also when zb_birth == 0 or when + * find_birth_txg() returns an error. + */ + affected = (error == 0 && zep->zb_birth == blk_txg) || + (error != 0) || (zep->zb_birth == 0); + } + + /* Scrub snapshots. */ + if (affected) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_in_one_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + return (0); +} + +void +dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Only process scans in sync pass 1. 
+ */ + + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { + return; + } + + if (dsl_scan_resilvering(scn->scn_dp)) { + /* cancel the error scrub if resilver started */ + dsl_scan_cancel(scn->scn_dp); + return; + } + + spa->spa_scrub_active = B_TRUE; + scn->scn_sync_start_time = gethrtime(); + + /* + * zfs_scan_suspend_progress can be set to disable scrub progress. + * See more detailed comment in dsl_scan_sync(). + */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = zfs_error_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + int i = 0; + zap_attribute_t *za; + zbookmark_phys_t *zb; + boolean_t limit_exceeded = B_FALSE; + + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + name_to_bookmark(za->za_name, zb); + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + read_by_block_level(scn, *zb); + dsl_pool_config_exit(dp, FTAG); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined += 1; + scn->errorscrub_phys.dep_to_examine -= 1; + i++; + if (i == zfs_scrub_error_blocks_in_one_txg || + dsl_error_scrub_check_suspend(scn, zb)) { + limit_exceeded = B_TRUE; + break; + } + } + + if 
(!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + return; + } + + int error = 0; + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + boolean_t config_held = B_FALSE; + uint64_t top_affected_fs; + + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + + /* + * In case we are called from spa_sync the pool + * config is already held. + */ + if (!dsl_pool_config_held(dp)) { + dsl_pool_config_enter(dp, FTAG); + config_held = B_TRUE; + } + + error = find_top_affected_fs(spa, + head_ds, &head_ds_block, &top_affected_fs); + if (error) + break; + + error = scrub_filesystem(spa, top_affected_fs, + &head_ds_block, &i); + + if (error == SET_ERROR(EFAULT)) { + limit_exceeded = B_TRUE; + break; + } + } + + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + if (config_held) + dsl_pool_config_exit(dp, FTAG); + } + + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); +} + /* * This is the primary entry point for scans that is called from syncing * context. 
Scans must happen entirely during syncing context so that we @@ -4108,7 +4783,14 @@ dsl_scan_scrub_done(zio_t *zio) if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + if (dsl_errorscrubbing(spa->spa_dsl_pool) && + !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan + ->errorscrub_phys.dep_errors); + } else { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys + .scn_errors); + } } } @@ -4558,3 +5240,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_in_one_txg, ULONG, ZMOD_RW, + "Error blocks to be scrubbed in one txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, error_scrub_min_time_ms, INT, ZMOD_RW, + "Min millisecs to scrub error blocks per txg"); +/* END CSTYLED */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c2a67fbc7c55..645f999c05a2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8166,6 +8166,7 @@ spa_scan_stop(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); } @@ -9242,6 +9243,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index e0604c4a84af..506c1973615e 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -110,7 +110,7 @@ errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) /* * Convert a string to a err_phys. 
*/ -static void +void name_to_errphys(char *buf, zbookmark_err_phys_t *zep) { zep->zb_object = zfs_strtonum(buf, &buf); @@ -139,8 +139,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) ASSERT(*buf == '\0'); } -#ifdef _KERNEL -static void +void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { zb->zb_objset = dataset; @@ -148,7 +147,6 @@ zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) zb->zb_level = zep->zb_level; zb->zb_blkid = zep->zb_blkid; } -#endif static void name_to_object(char *buf, uint64_t *obj) @@ -238,8 +236,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) mutex_exit(&spa->spa_errlist_lock); } -#ifdef _KERNEL -static int +int find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg) { @@ -267,6 +264,33 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, return (error); } +/* + * This function finds the oldest affected filesystem containing an error + * block. + */ +int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, + FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +#ifdef _KERNEL /* * Copy the bookmark to the end of the user-space buffer which starts at * uaddr and has *count unused entries, and decrement *count by 1. @@ -288,7 +312,8 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) * Each time the error block is referenced by a snapshot or clone, add a * zbookmark_phys_t entry to the userspace array at uaddr. 
The array is * filled from the back and the in-out parameter *count is modified to be the - * number of unused entries at the beginning of the array. + * number of unused entries at the beginning of the array. The function + * scrub_filesystem() is modelled after this one. */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, @@ -463,28 +488,6 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); } -static int -find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - uint64_t *top_affected_fs) -{ - uint64_t oldest_dsobj; - int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, - &oldest_dsobj); - if (error != 0) - return (error); - - dsl_dataset_t *ds; - error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, - FTAG, &ds); - if (error != 0) - return (error); - - *top_affected_fs = - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); - return (0); -} - static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) @@ -539,6 +542,21 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } #endif +/* Return the number of errors in the error log */ +uint64_t +spa_get_last_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + mutex_enter(&spa->spa_errlog_lock); + + if (spa->spa_errlog_last != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + return (total); +} + /* * If a healed bookmark matches an entry in the error log we stash it in a tree * so that we can later remove the related log entries in sync context. 
@@ -1354,6 +1372,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, /* error handling */ EXPORT_SYMBOL(spa_log_error); EXPORT_SYMBOL(spa_approx_errlog_size); +EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); @@ -1363,6 +1382,10 @@ EXPORT_SYMBOL(spa_delete_dataset_errlog); EXPORT_SYMBOL(spa_swap_errlog); EXPORT_SYMBOL(sync_error_list); EXPORT_SYMBOL(spa_upgrade_errlog); +EXPORT_SYMBOL(find_top_affected_fs); +EXPORT_SYMBOL(find_birth_txg); +EXPORT_SYMBOL(zep_to_zb); +EXPORT_SYMBOL(name_to_errphys); #endif /* BEGIN CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 54a0eeccf27b..89e1ce7165db 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2579,9 +2579,18 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; + + if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_errorscrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; + + // error scrub stats + spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* @@ -2592,8 +2601,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && + scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); + memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ @@ -2616,6 +2627,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* error scrub data stored on disk */ + ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; + ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; + ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; + ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; + ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; + ps->pss_error_scrub_to_be_examined = + scn->errorscrub_phys.dep_to_examine; + + /* error scrub data not stored on disk */ + ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; + return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 22e644f75f95..85f1c2859d4d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1685,6 +1685,45 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * poolname name of the pool + * scan_type scan func (pool_scan_func_t) + * scan_command scrub pause/resume flag (pool_scrub_cmd_t) + */ +static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { + {"scan_type", DATA_TYPE_UINT64, 0}, + {"scan_command", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + pool_scan_func_t scan_type = + (pool_scan_func_t)fnvlist_lookup_uint64(innvl, "scan_type"); + pool_scrub_cmd_t scan_cmd = + (pool_scrub_cmd_t)fnvlist_lookup_uint64(innvl, "scan_command"); + + if (scan_cmd >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + 
return (error); + + if (scan_cmd == POOL_SCRUB_PAUSE) { + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + } else if (scan_type == POOL_SCAN_NONE) { + error = spa_scan_stop(spa); + } else { + error = spa_scan(spa, scan_type); + } + + spa_close(spa, FTAG); + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -7217,6 +7256,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); + zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, + zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_NONE, B_TRUE, B_TRUE, + zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e2137ac596d9..c691e88bfefb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -478,7 +478,9 @@ tags = ['functional', 'cli_root', 'zpool_resilver'] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', - 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', + 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', + 'zpool_error_scrub_003_pos'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index a1dfaefd7105..c661718a296c 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Test the nvpair inputs for the non-legacy zfs ioctl commands. 
@@ -688,6 +689,17 @@ test_vdev_trim(const char *pool) nvlist_free(required); } +/* Test with invalid values */ +static void +test_scrub(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + fnvlist_add_uint64(required, "scan_type", POOL_SCAN_FUNCS + 1); + fnvlist_add_uint64(required, "scan_command", POOL_SCRUB_FLAGS_END + 1); + IOC_INPUT_TEST(ZFS_IOC_POOL_SCRUB, pool, required, NULL, EINVAL); + nvlist_free(required); +} + static int zfs_destroy(const char *dataset) { @@ -868,6 +880,8 @@ zfs_ioc_input_tests(const char *pool) test_set_bootenv(pool); test_get_bootenv(pool); + test_scrub(pool); + /* * cleanup */ @@ -1022,6 +1036,7 @@ validate_ioc_values(void) CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); + CHECK(ZFS_IOC_BASE + 87 == ZFS_IOC_POOL_SCRUB); CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 02e6a500a71a..7d6518707345 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1968,6 +1968,12 @@ function is_pool_scrubbing #pool check_pool_status "$1" "scan" "scrub in progress since " $2 } +function is_pool_error_scrubbing #pool +{ + check_pool_status "$1" "scrub" "error scrub in progress since " $2 + return $? +} + function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 @@ -1978,11 +1984,23 @@ function is_pool_scrub_stopped #pool check_pool_status "$1" "scan" "scrub canceled" $2 } +function is_pool_error_scrub_stopped #pool +{ + check_pool_status "$1" "scrub" "error scrub canceled on " $2 + return $? 
+} + function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 } +function is_pool_error_scrub_paused #pool +{ + check_pool_status "$1" "scrub" "error scrub paused since " $2 + return $? +} + function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9299a4ca9b47..9d9674f76fad 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1150,6 +1150,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh new file mode 100755 index 000000000000..43ecf31021e5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2022, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify scrub -e, -p, and -s show the right status. +# +# STRATEGY: +# 1. Create a pool and create a 10MB file in it. +# 2. Start an error scrub (-e) and verify it's doing a scrub. +# 3. Pause error scrub (-p) and verify it's paused. +# 4. Try to pause a paused error scrub (-p) and make sure that fails. +# 5. Resume the paused error scrub and verify again it's doing a scrub. +# 6. Verify zpool scrub -s succeeds when the system is error scrubbing. +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must zinject -c all + rm -f /$TESTPOOL/10m_file +} + +log_onexit cleanup + +log_assert "Verify scrub -e, -p, and -s show the right status."
+ +log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file + +# create some error blocks +dd if=/$TESTPOOL/10m_file bs=1M count=1 || true + +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_must zpool scrub -p $TESTPOOL +log_must is_pool_error_scrub_paused $TESTPOOL true +log_mustnot zpool scrub -p $TESTPOOL +log_must is_pool_error_scrub_paused $TESTPOOL true +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_error_scrub_stopped $TESTPOOL true + +log_pass "Verified scrub -e, -p, and -s show expected status." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh new file mode 100755 index 000000000000..207bef0a4001 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh @@ -0,0 +1,99 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2022, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify regular scrub and error scrub can't run at the same time. +# +# STRATEGY: +# 1. Create a pool and create a 10MB file in it. +# 2. Start a scrub and verify it's doing a scrub. +# 3. Start an error scrub (-e) and verify it fails. +# 4. Pause the scrub (-p) and verify it's paused. +# 5. Start an error scrub (-e) and verify it fails again. +# 6. Resume the paused scrub, verify it and cancel it. +# 7. Start an error scrub (-e) and verify it's doing an error scrub. +# 8. Start a scrub and verify it fails. +# 9. Cancel the error scrub (-s) and verify it is canceled. +# 10. Start a scrub, verify it, cancel it and verify it. +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must zinject -c all + rm -f /$TESTPOOL/10m_file +} + +log_onexit cleanup + +log_assert "Verify regular scrub and error scrub can't run at the same time." + +log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file + +# create some error blocks before the error scrub is requested. 
+dd if=/$TESTPOOL/10m_file of=/dev/null bs=1M count=1 || true +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_mustnot zpool scrub -e $TESTPOOL +log_must zpool scrub -p $TESTPOOL +log_must is_pool_scrub_paused $TESTPOOL true +log_mustnot zpool scrub -e $TESTPOOL +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_scrub_stopped $TESTPOOL true + +# create some error blocks before the error scrub is requested. +dd if=/$TESTPOOL/10m_file of=/dev/null bs=1M count=1 || true +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_mustnot zpool scrub $TESTPOOL +log_must zpool scrub -s $TESTPOOL +log_must is_pool_error_scrub_stopped $TESTPOOL true + +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_scrub_stopped $TESTPOOL true + +log_pass "Verified regular scrub and error scrub can't run at the same time." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh new file mode 100755 index 000000000000..d3ab6309d44d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2022, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify error scrub clears the errorlog, if errors no longer exist. +# +# STRATEGY: +# 1. Create a pool and create a file in it. +# 2. Zinject errors and read using dd to log errors to disk. +# 3. Make sure the file name is mentioned in the list of error files. +# 4. Start an error scrub and wait for it to finish. +# 5. Check the scrub ran and errors are still reported. +# 6. Clear the corruption and error scrub again. +# 7. Check the scrub ran and errors are cleared. +# + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL2 + rm -f $TESTDIR/vdev_a +} + +log_onexit cleanup + +log_assert "Verify error scrub clears the errorlog, if errors no longer exist." 
+ +log_must truncate -s $MINVDEVSIZE $TESTDIR/vdev_a +log_must zpool create -f -O primarycache=none $TESTPOOL2 $TESTDIR/vdev_a +log_must zfs create $TESTPOOL2/$TESTFS1 +typeset file=/$TESTPOOL2/$TESTFS1/$TESTFILE0 +log_must dd if=/dev/urandom of=$file bs=2M count=10 + +lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')" +for i in {1..3}; do + log_must zfs snap $lastfs@snap$i + log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i + lastfs="$(zfs list -r $TESTPOOL2/clone$i | tail -1 | awk '{print $1}')" +done + +log_must zinject -t data -e checksum -f 100 -a $file +dd if=$file of=/dev/null bs=2M count=10 + +# Important: sync error log to disk +log_must sync_pool $TESTPOOL2 + +# Check reported errors +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +# Check errors are reported if corruption persists +log_must zpool scrub -e -w $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | grep 'error blocks'" +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_must eval "zpool status -v | grep
'$TESTPOOL2/clone2/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +# Check errors are cleared once the injected corruption is removed +log_must zinject -c all +log_must zpool scrub -e -w $TESTPOOL2 +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | grep 'error blocks'" +log_mustnot eval "zpool status -v $TESTPOOL2 | grep '$TESTFILE0'" + + +log_pass "Verify error scrub clears the errorlog, if errors no longer exist."