diff --git a/cmd/arc_summary b/cmd/arc_summary index 9c69ec4f8ccc..100fb1987a8b 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -793,18 +793,27 @@ def section_dmu(kstats_dict): zfetch_stats = isolate_section('zfetchstats', kstats_dict) - zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + zfetch_access_total = int(zfetch_stats['hits']) +\ + int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\ + int(zfetch_stats['past']) + int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) + future = int(zfetch_stats['future']) + int(zfetch_stats['stride']) + prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total), + f_hits(future)) + prt_i2('Hits behind stream:', + f_perc(zfetch_stats['past'], zfetch_access_total), + f_hits(zfetch_stats['past'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) + prt_i1('Stream strides:', f_hits(zfetch_stats['stride'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index a11b6d0b7fac..07d3d8af9980 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. */ /* @@ -208,6 +209,37 @@ type_to_name(uint64_t type) } } +struct errstr { + int err; + const char *str; +}; +static const struct errstr errstrtable[] = { + { EIO, "io" }, + { ECKSUM, "checksum" }, + { EINVAL, "decompress" }, + { EACCES, "decrypt" }, + { ENXIO, "nxio" }, + { ECHILD, "dtl" }, + { EILSEQ, "corrupt" }, + { 0, NULL }, +}; + +static int +str_to_err(const char *str) +{ + for (int i = 0; errstrtable[i].str != NULL; i++) + if (strcasecmp(errstrtable[i].str, str) == 0) + return (errstrtable[i].err); + return (-1); +} +static const char * +err_to_str(int err) +{ + for (int i = 0; errstrtable[i].str != NULL; i++) + if (errstrtable[i].err == err) + return (errstrtable[i].str); + return ("[unknown]"); +} /* * Print usage message. @@ -233,7 +265,7 @@ usage(void) "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" "\tzinject -d device [-e errno] [-L ] [-F]\n" - "\t\t[-T ] [-f frequency] pool\n\n" + "\t\t[-T ] [-f frequency] pool\n\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. 
Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" @@ -392,6 +424,10 @@ static int print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { + static const char *iotypestr[] = { + "null", "read", "write", "free", "claim", "ioctl", "trim", "all", + }; + int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') @@ -401,14 +437,21 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); - (void) printf("--- --------------- ----------------\n"); + (void) printf("%3s %-15s %-16s %-5s %-10s %-9s\n", + "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ"); + (void) printf( + "--- --------------- ---------------- " + "----- ---------- ---------\n"); } *count += 1; - (void) printf("%3d %-15s %llx\n", id, pool, - (u_longlong_t)record->zi_guid); + double freq = record->zi_freq == 0 ? 100.0f : + (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f; + + (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%%\n", id, pool, + (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype], + err_to_str(record->zi_error), freq); return (0); } @@ -842,24 +885,12 @@ main(int argc, char **argv) } break; case 'e': - if (strcasecmp(optarg, "io") == 0) { - error = EIO; - } else if (strcasecmp(optarg, "checksum") == 0) { - error = ECKSUM; - } else if (strcasecmp(optarg, "decompress") == 0) { - error = EINVAL; - } else if (strcasecmp(optarg, "decrypt") == 0) { - error = EACCES; - } else if (strcasecmp(optarg, "nxio") == 0) { - error = ENXIO; - } else if (strcasecmp(optarg, "dtl") == 0) { - error = ECHILD; - } else if (strcasecmp(optarg, "corrupt") == 0) { - error = EILSEQ; - } else { + error = str_to_err(optarg); + if (error < 0) { (void) fprintf(stderr, "invalid error type " - "'%s': must be 'io', 'checksum' or " - "'nxio'\n", optarg); + "'%s': must be one of: io decompress " + "decrypt nxio dtl corrupt\n", + optarg); usage(); libzfs_fini(g_zfs); return (1); @@ -947,12 +978,14 @@ main(int argc, char **argv) io_type = ZIO_TYPE_FREE; } else if (strcasecmp(optarg, "claim") == 0) { io_type = ZIO_TYPE_CLAIM; + } else if (strcasecmp(optarg, "ioctl") == 0) { + io_type = ZIO_TYPE_IOCTL; } else if (strcasecmp(optarg, "all") == 0) { io_type = ZIO_TYPES; } else { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " - "'claim' or 'all'\n", optarg); + "'claim', 'ioctl' or 'all'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index c85a5f285154..d670cd1afeb1 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2289,7 +2289,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; @@ -2299,8 +2298,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) 100 / (vs->vs_initialize_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: @@ -2340,7 +2339,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; @@ -2349,8 +2347,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) 
100 / (vs->vs_trim_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: @@ -10793,11 +10791,10 @@ zpool_do_get(int argc, char **argv) } } else { /* - * The first arg isn't a pool name, + * The first arg isn't the name of a valid pool. */ - fprintf(stderr, gettext("missing pool name.\n")); - fprintf(stderr, "\n"); - usage(B_FALSE); + fprintf(stderr, gettext("Cannot get properties of %s: " + "no such pool available.\n"), argv[0]); return (1); } diff --git a/config/Substfiles.am b/config/Substfiles.am index 38e870b2f501..18422bf64301 100644 --- a/config/Substfiles.am +++ b/config/Substfiles.am @@ -18,6 +18,7 @@ subst_sed_cmd = \ -e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \ -e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \ -e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \ + -e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \ -e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \ -e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \ -e 's|@PYTHON[@]|$(PYTHON)|g' \ diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index bb5903b313eb..15dbe1c7dff0 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ (void) blk_mq_alloc_tag_set(&tag_set); return BLK_STS_OK; ], []) + ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [ + #include + #include + ], [ + struct request rq = {0}; + struct blk_mq_hw_ctx *hctx = NULL; + rq.mq_hctx = hctx; + ], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ @@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ ZFS_LINUX_TEST_RESULT([blk_mq], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request]) + ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request]) + ], [ + AC_MSG_RESULT(no) + ]) ], [ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index c5a353ca9203..b6ce1e1cf083 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # bdev_file_open_by_path() replaced bdev_open_by_path(), +dnl # and returns struct file* +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ + ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [ + #include + #include + ], [ + struct file *file __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + file = bdev_file_open_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ @@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ [bdev_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_file_open_by_path() exists]) + ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [ + AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1, + [bdev_file_open_by_path() exists]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) ]) ]) ]) @@ -149,10 +178,19 @@ 
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # +dnl # bdev_release() now private, but because bdev_file_open_by_path() returns +dnl # struct file*, we can just use fput(). So the blkdev_put test no longer +dnl # fails if not found. +dnl # + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) @@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_RESULT(no) ]) ]) ]) @@ -523,12 +561,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ ]) dnl # -dnl # 5.19 API: blkdev_issue_secure_erase() -dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) -dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # TRIM support: discard and secure erase. We make use of asynchronous +dnl # functions when available. dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ - ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ +dnl # 3.10: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: [not available] +dnl # async erase: [not available] +dnl # +dnl # 4.7: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: __blkdev_issue_discard(..., 0) +dnl # async erase: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # +dnl # 5.19: +dnl # sync discard: blkdev_issue_discard(...) +dnl # sync erase: blkdev_issue_secure_erase(...) +dnl # async discard: __blkdev_issue_discard(...) 
+dnl # async erase: [not available] +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [ #include ],[ struct block_device *bdev = NULL; @@ -536,10 +591,33 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector_t nr_sects = 0; int error __attribute__ ((unused)); - error = blkdev_issue_secure_erase(bdev, + error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + int error __attribute__ ((unused)); + error = blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, &biop); + ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ #include ],[ @@ -553,22 +631,52 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags, &biop); ]) - - ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; - unsigned long flags = 0; int error __attribute__ ((unused)); - error = blkdev_issue_discard(bdev, - sector, nr_sects, GFP_KERNEL, flags); + error = blkdev_issue_secure_erase(bdev, + sector, nr_sects, GFP_KERNEL); ]) ]) -AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [ + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1, + [blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1, + [blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1, + [__blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1, + [__blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) @@ -576,24 +684,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, - [__blkdev_issue_discard() is available]) - ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - 
ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) - ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) - ]) - ]) ]) ]) @@ -645,6 +735,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH + ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE @@ -657,7 +748,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME - ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE @@ -678,7 +769,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS - ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 4d20dd45c4a1..9813ad2fb3f3 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [ + #include + ],[ + struct queue_limits *lim = NULL; + struct gendisk *disk __attribute__ ((unused)); + disk = blk_alloc_disk(lim, NUMA_NO_NODE); + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include ],[ @@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ ], [ AC_MSG_RESULT(no) ]) + + dnl # + dnl # Linux 6.9 API Change: + dnl # blk_alloc_queue() takes a nullable queue_limits arg. + dnl # + AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args]) + ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + + dnl # + dnl # 5.20 API change, + dnl # Removed blk_cleanup_disk(), put_disk() should be used. 
+ dnl # + AC_MSG_CHECKING([whether blk_cleanup_disk() exists]) + ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1, + [blk_cleanup_disk() exists]) + ], [ + AC_MSG_RESULT(no) + ]) + ], [ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 5f36569fe25b..bb5a85d815d1 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - *) DEFAULT_INIT_SHELL="/bin/sh" ;; + gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + IS_SYSV_RC=false ;; + *) DEFAULT_INIT_SHELL=/bin/sh + IS_SYSV_RC=true ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) + AC_SUBST(IS_SYSV_RC) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], diff --git a/etc/init.d/README.md b/etc/init.d/README.md index 2de05042ce63..da780fdc1222 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -7,11 +7,7 @@ DESCRIPTION They have been tested successfully on: - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - * Ubuntu Trusty - * CentOS 6.0 - * CentOS 6.6 + * Debian GNU/Linux Bookworm * Gentoo SUPPORT diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in index a9a0604f81ac..ff169eb96d86 100755 --- a/etc/init.d/zfs-import.in +++ b/etc/init.d/zfs-import.in @@ -307,7 +307,7 @@ do_start() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-load-key.in b/etc/init.d/zfs-load-key.in index 53c7766b793a..27dfeeb0bcc5 100755 --- a/etc/init.d/zfs-load-key.in +++ b/etc/init.d/zfs-load-key.in @@ -104,7 +104,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in index a0825f19fcdd..6a3ca5f86908 100755 --- a/etc/init.d/zfs-mount.in +++ b/etc/init.d/zfs-mount.in @@ -114,7 +114,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in index 88978071cbf6..06c59c620b75 100755 --- a/etc/init.d/zfs-share.in +++ b/etc/init.d/zfs-share.in @@ -57,7 +57,8 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in index e9cf8867403c..3d40600cea5d 100755 --- a/etc/init.d/zfs-zed.in +++ b/etc/init.d/zfs-zed.in @@ -93,7 +93,8 @@ do_reload() # ---------------------------------------------------- -if [ ! 
-e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 551f75f42a20..d4103c2f062a 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -4,8 +4,6 @@ noinst_HEADERS = \ \ %D%/spl/acl/acl_common.h \ \ - %D%/spl/rpc/xdr.h \ - \ %D%/spl/sys/ia32/asm_linkage.h \ \ %D%/spl/sys/acl.h \ diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h deleted file mode 100644 index c98466e9d16a..000000000000 --- a/include/os/freebsd/spl/rpc/xdr.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. - * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. - * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. - * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. - * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. - * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 - */ - -#ifndef _OPENSOLARIS_RPC_XDR_H_ -#define _OPENSOLARIS_RPC_XDR_H_ - -#include -#include_next - -#if !defined(_KERNEL) && !defined(_STANDALONE) - -#include - -/* - * Taken from sys/xdr/xdr_mem.c. - * - * FreeBSD's userland XDR doesn't implement control method (only the kernel), - * but OpenSolaris nvpair still depend on it, so we have to implement it here. - */ -static __inline bool_t -xdrmem_control(XDR *xdrs, int request, void *info) -{ - xdr_bytesrec *xptr; - - switch (request) { - case XDR_GET_BYTES_AVAIL: - xptr = (xdr_bytesrec *)info; - xptr->xc_is_last_record = TRUE; - xptr->xc_num_avail = xdrs->x_handy; - return (TRUE); - default: - assert(!"unexpected request"); - } - return (FALSE); -} - -#undef XDR_CONTROL -#define XDR_CONTROL(xdrs, req, op) \ - (((xdrs)->x_ops->x_control == NULL) ? \ - xdrmem_control((xdrs), (req), (op)) : \ - (*(xdrs)->x_ops->x_control)(xdrs, req, op)) - -#endif /* !_KERNEL && !_STANDALONE */ - -#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 51c27132b4ef..332569efe361 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -47,6 +47,7 @@ kernel_sys_HEADERS = \ kernel_spl_rpcdir = $(kerneldir)/spl/rpc kernel_spl_rpc_HEADERS = \ + %D%/spl/rpc/types.h \ %D%/spl/rpc/xdr.h kernel_spl_sysdir = $(kerneldir)/spl/sys diff --git a/include/os/linux/spl/rpc/types.h b/include/os/linux/spl/rpc/types.h new file mode 100644 index 000000000000..5bbb4f2dec46 --- /dev/null +++ b/include/os/linux/spl/rpc/types.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008 Sun Microsystems, Inc. 
+ * Written by Ricardo Correia + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +#include + +/* Just enough to support rpc/xdr.h */ + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h index 05aed7cb81ce..606566113e1c 100644 --- a/include/os/linux/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -24,8 +24,6 @@ #include #include -typedef int bool_t; - /* * XDR enums and types. */ diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index f00e13cf03a6..322472fb1ae2 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -45,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index f39ebf031cea..2a93f7c680bc 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -455,7 +455,7 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Vdev properties for tuning ZED + * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index d563edd7ba59..e54456d3472b 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -132,7 +132,7 @@ typedef struct zap_leaf_phys { * with the ZAP_LEAF_CHUNK() macro. 
*/ - uint16_t l_hash[1]; + uint16_t l_hash[]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 1c0a44059d24..4b3726d7eec4 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -153,7 +153,7 @@ enum zio_stage { ZIO_STAGE_READY = 1 << 20, /* RWFCIT */ ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--IT */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW---T */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--IT */ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--IT */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ @@ -261,8 +261,7 @@ enum zio_stage { #define ZIO_IOCTL_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_ASSESS) + ZIO_VDEV_IO_STAGES) #define ZIO_TRIM_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index de5d7dbcb97f..6a8a815cb0c0 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1900,7 +1900,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " @@ -1962,7 +1963,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), diff --git a/man/Makefile.am b/man/Makefile.am index 45156571eec3..43bb014ddd32 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,7 +62,6 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ - %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ @@ -115,7 +114,8 @@ endif nodist_man_MANS = \ %D%/man8/zed.8 \ - %D%/man8/zfs-mount-generator.8 + %D%/man8/zfs-mount-generator.8 \ + %D%/man8/zfs_prepare_disk.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index cacb214d1dc1..6088ebc7ef35 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -564,6 +564,10 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . +.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint +Log2 fraction of holes in speculative prefetch stream allowed for it to +proceed. +. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to @@ -578,6 +582,13 @@ Max bytes to prefetch per stream. .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . +.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint +Requests within this byte distance from the current prefetch stream position +are considered parts of the stream, reordered due to parallel processing. +Such requests do not advance the stream position immediately unless +.Sy zfetch_hole_shift +fill threshold is reached, but saved to fill holes in the stream later. +. 
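
[Editorial illustration, not part of the patch.] To make the zfetch_max_reorder / zfetch_hole_shift tunables documented above concrete, here is a minimal sketch of how a log2 "hole" threshold of this kind can be applied. It is not the actual dmu_zfetch.c logic; the function and variable names are illustrative. The idea is that a prefetch stream may advance past blocks that have not yet been read only while those holes make up no more than span >> zfetch_hole_shift of the span being skipped.

#include <stdint.h>

static unsigned int zfetch_hole_shift = 2;	/* log2 fraction of holes allowed */

/*
 * Return nonzero if a stream spanning 'span' blocks, of which only
 * 'covered' have actually been read so far, may still be treated as
 * sequential and advanced.  With the default shift of 2, at most one
 * quarter of the span may be holes.
 */
static int
stream_may_advance(uint64_t covered, uint64_t span)
{
	uint64_t holes = span - covered;

	return (holes <= (span >> zfetch_hole_shift));
}

With the default of 2, a stream that has seen 6 of the last 8 blocks (2 holes out of 8, exactly the 1/4 limit) would still advance, while 3 holes out of 8 would not.
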
.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . @@ -2387,6 +2398,13 @@ The number of requests which can be handled concurrently is controlled by is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . +.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint +Number of zvol taskqs. +If +.Sy 0 +(the default) then scaling is done internally to prefer 6 threads per taskq. +This only applies on Linux. +. .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 3d3ebc072915..5ec37df179de 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used. The value of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev - not on the leaf vdev itself. -The default values are 10 errors in 600 seconds. +The default values for +.Sy OpenZFS on Linux +are 10 errors in 600 seconds. +For +.Sy OpenZFS on FreeBSD +defaults see +.Xr zfsd 8 . .It Sy comment A text comment up to 8192 characters long .It Sy bootsize diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index b692f12130a8..817dcb7fe32a 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -19,10 +19,11 @@ .\" CDDL HEADER END .\" .\" Copyright 2013 Darik Horn . All rights reserved. +.\" Copyright (c) 2024, Klara Inc. .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd May 26, 2021 +.Dd April 4, 2024 .Dt ZINJECT 8 .Os . @@ -257,6 +258,7 @@ Run for this many seconds before reporting failure. .It Fl T Ar failure Set the failure type to one of .Sy all , +.Sy ioctl , .Sy claim , .Sy free , .Sy read , diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index a7a9e33442da..12331b7b2a2d 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -404,7 +404,7 @@ ZIO_STAGE_DVA_CLAIM:0x00080000:---C-- ZIO_STAGE_READY:0x00100000:RWFCIT ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT -ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T +ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--IT ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- diff --git a/module/Makefile.bsd b/module/Makefile.bsd index e9ad69fc50a2..d9d31564d090 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -82,12 +82,9 @@ CFLAGS+= -DBITS_PER_LONG=64 SRCS= vnode_if.h device_if.h bus_if.h -# avl +#avl SRCS+= avl.c -# icp -SRCS+= edonr.c - #icp/algs/blake3 SRCS+= blake3.c \ blake3_generic.c \ @@ -107,9 +104,12 @@ SRCS+= blake3_avx2.S \ blake3_sse2.S \ blake3_sse41.S +#icp/algs/edonr +SRCS+= edonr.c + #icp/algs/sha2 -SRCS+= sha2_generic.c \ - sha256_impl.c \ +SRCS+= sha256_impl.c \ + sha2_generic.c \ sha512_impl.c #icp/asm-arm/sha2 @@ -122,8 +122,8 @@ SRCS+= sha256-armv8.S \ #icp/asm-ppc64/sha2 SRCS+= sha256-p8.S \ - sha512-p8.S \ sha256-ppc.S \ + sha512-p8.S \ sha512-ppc.S #icp/asm-x86_64/sha2 @@ -157,10 +157,10 @@ SRCS+= lapi.c \ lzio.c #nvpair -SRCS+= nvpair.c \ - fnvpair.c \ - nvpair_alloc_spl.c \ - nvpair_alloc_fixed.c +SRCS+= fnvpair.c \ + nvpair.c \ + nvpair_alloc_fixed.c \ + nvpair_alloc_spl.c #os/freebsd/spl SRCS+= acl_common.c \ @@ -184,7 +184,6 @@ SRCS+= acl_common.c \ spl_zlib.c \ spl_zone.c - .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "powerpcspe" || ${MACHINE_ARCH} == "arm" SRCS+= spl_atomic.c @@ 
-207,6 +206,7 @@ SRCS+= abd_os.c \ zfs_ctldir.c \ zfs_debug.c \ zfs_dir.c \ + zfs_file_os.c \ zfs_ioctl_compat.c \ zfs_ioctl_os.c \ zfs_racct.c \ @@ -217,19 +217,20 @@ SRCS+= abd_os.c \ zvol_os.c #unicode -SRCS+= uconv.c \ - u8_textprep.c +SRCS+= u8_textprep.c \ + uconv.c #zcommon -SRCS+= zfeature_common.c \ +SRCS+= cityhash.c \ + zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ - zfs_fletcher.c \ zfs_fletcher_avx512.c \ + zfs_fletcher.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ - zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ + zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ @@ -243,14 +244,13 @@ SRCS+= abd.c \ blkptr.c \ bplist.c \ bpobj.c \ + bptree.c \ + bqueue.c \ brt.c \ btree.c \ - cityhash.c \ + dataset_kstats.c \ dbuf.c \ dbuf_stats.c \ - bptree.c \ - bqueue.c \ - dataset_kstats.c \ ddt.c \ ddt_stats.c \ ddt_zap.c \ @@ -266,13 +266,13 @@ SRCS+= abd.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ + dsl_bookmark.c \ + dsl_crypt.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ - dsl_bookmark.c \ - dsl_dir.c \ - dsl_crypt.c \ dsl_destroy.c \ + dsl_dir.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ @@ -281,9 +281,9 @@ SRCS+= abd.c \ edonr_zfs.c \ fm.c \ gzip.c \ - lzjb.c \ lz4.c \ lz4_zfs.c \ + lzjb.c \ metaslab.c \ mmp.c \ multilist.c \ @@ -296,6 +296,8 @@ SRCS+= abd.c \ sha2_zfs.c \ skein_zfs.c \ spa.c \ + space_map.c \ + space_reftree.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ @@ -303,16 +305,14 @@ SRCS+= abd.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ - space_map.c \ - space_reftree.c \ txg.c \ uberblock.c \ unique.c \ vdev.c \ vdev_draid.c \ vdev_draid_rand.c \ - vdev_indirect.c \ vdev_indirect_births.c \ + vdev_indirect.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ @@ -320,11 +320,11 @@ SRCS+= abd.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ - vdev_raidz_math.c \ - vdev_raidz_math_scalar.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_rebuild.c \ @@ -343,7 +343,6 @@ SRCS+= abd.c \ zfeature.c \ zfs_byteswap.c \ zfs_chksum.c \ - zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ zfs_impl.c \ @@ -367,30 +366,36 @@ SRCS+= abd.c \ zvol.c #zstd -SRCS+= zfs_zstd.c \ - entropy_common.c \ +SRCS+= zfs_zstd.c + +#zstd/common +SRCS+= entropy_common.c \ error_private.c \ - fse_compress.c \ fse_decompress.c \ - hist.c \ - huf_compress.c \ - huf_decompress.c \ pool.c \ xxhash.c \ zstd_common.c \ + +#zstd/compress +SRCS+= fse_compress.c \ + hist.c \ + huf_compress.c \ zstd_compress.c \ zstd_compress_literals.c \ zstd_compress_sequences.c \ zstd_compress_superblock.c \ - zstd_ddict.c \ - zstd_decompress.c \ - zstd_decompress_block.c \ zstd_double_fast.c \ zstd_fast.c \ zstd_lazy.c \ zstd_ldm.c \ zstd_opt.c +#zstd/decompress +SRCS+= huf_decompress.c \ + zstd_ddict.c \ + zstd_decompress_block.c \ + zstd_decompress.c + beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index d9449e47e87a..887f7d32df4a 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c index 6b77524181db..e1773da5d173 100644 --- a/module/os/linux/spl/spl-xdr.c +++ b/module/os/linux/spl/spl-xdr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* diff --git 
a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index df5fa067797a..a560bca918a8 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -45,15 +45,25 @@ /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to - * just use the handle as a proxy. For pre-6.8, we just emulate this with - * a cast, since we don't need any of the other fields inside the handle. + * just use the handle as a proxy. + * + * Linux 6.9.x uses a file for the same purpose. + * + * For pre-6.8, we just emulate this with a cast, since we don't need any of + * the other fields inside the handle. */ -#ifdef HAVE_BDEV_OPEN_BY_PATH +#if defined(HAVE_BDEV_OPEN_BY_PATH) typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) +#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +typedef struct file zfs_bdev_handle_t; +#define BDH_BDEV(bdh) (file_bdev(bdh)) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) @@ -242,7 +252,9 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); -#if defined(HAVE_BDEV_OPEN_BY_PATH) +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) + return (bdev_file_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BDEV_OPEN_BY_PATH) return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) return (blkdev_get_by_path(path, bmode, holder, NULL)); @@ -258,8 +270,10 @@ vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); -#else +#elif defined(HAVE_BLKDEV_PUT) return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); +#else + fput(bdh); #endif } @@ -1238,8 +1252,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) { zio_t *zio = bio->bi_private; @@ -1254,54 +1266,99 @@ BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) zio_interrupt(zio); } +/* + * Wrappers for the different secure erase and discard APIs. We use async + * when available; in this case, *biop is set to the last bio in the chain. 
+ */ static int -vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) { - int ret; - struct bio *bio = NULL; + *biop = NULL; + int error; -#if defined(BLKDEV_DISCARD_SECURE) - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + error = blkdev_issue_secure_erase(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); #else - (void) flags; - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#error "unsupported kernel" #endif - if (!ret && bio) { - bio->bi_private = zio; - bio->bi_end_io = vdev_disk_discard_end_io; - vdev_submit_bio(bio); - } - return (ret); + + return (error); } + +static int +vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) +{ + *biop = NULL; + int error; + +#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#else +#error "unsupported kernel" #endif + return (error); +} + +/* + * Entry point for TRIM ops. This calls the right wrapper for secure erase or + * discard, and then does the appropriate finishing work for error vs success + * and async vs sync. + */ static int vdev_disk_io_trim(zio_t *zio) { - unsigned long trim_flags = 0; - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - return (-blkdev_issue_secure_erase( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); -#elif defined(BLKDEV_DISCARD_SECURE) - trim_flags |= BLKDEV_DISCARD_SECURE; -#endif + int error; + struct bio *bio; + + zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; + sector_t sector = zio->io_offset >> 9; + sector_t nsects = zio->io_size >> 9; + + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); + else + error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); + + if (error != 0) + return (SET_ERROR(-error)); + + if (bio == NULL) { + /* + * This was a synchronous op that completed successfully, so + * return it to ZFS immediately. + */ + zio_interrupt(zio); + } else { + /* + * This was an asynchronous op; set up completion callback and + * issue it. 
+ */ + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) - return (vdev_issue_discard_trim(zio, trim_flags)); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - return (-blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); -#else -#error "Unsupported kernel" -#endif + + return (0); } int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; @@ -1376,14 +1433,12 @@ vdev_disk_io_start(zio_t *zio) return; case ZIO_TYPE_TRIM: - zio->io_error = vdev_disk_io_trim(zio); + error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) - zio_interrupt(zio); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - zio_interrupt(zio); -#endif + if (error) { + zio->io_error = error; + zio_execute(zio); + } return; case ZIO_TYPE_READ: diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 26cc63d426eb..4b960daf89ee 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif @@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE; static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif +static unsigned int zvol_num_taskqs = 0; + #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -114,7 +123,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -static taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -532,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) +#ifdef HAVE_BLK_MQ_RQ_HCTX + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#else + blk_mq_hw_queue = + rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; +#endif +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -601,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -609,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -631,7 +660,7 @@ 
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -1053,6 +1082,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) if (zso->zvo_disk == NULL) return (1); + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + return (1); + } + + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else @@ -1103,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv) } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { @@ -1256,7 +1306,7 @@ zvol_os_free(zvol_state_t *zv) del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - defined(HAVE_BLK_ALLOC_DISK) + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else @@ -1577,8 +1627,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use atleast 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. 
+ * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1598,11 +1680,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1613,9 +1706,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1636,6 +1742,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4e190c131e1d..d9fc6cf6af34 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1557,17 +1557,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning. 
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -1580,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; } @@ -1643,8 +1639,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if (err != 0) goto early_unlock; - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1669,12 +1663,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * parent's rwlock, which would be a lock ordering violation. 
*/ dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1759,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { int err = 0; boolean_t prefetch; @@ -1775,7 +1768,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) @@ -1822,13 +1815,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (zio == NULL && (db->db_state == DB_NOFILL || + if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us @@ -1849,9 +1842,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ if (need_wait) { if (err == 0) - err = zio_wait(zio); + err = zio_wait(pio); else - VERIFY0(zio_wait(zio)); + (void) zio_wait(pio); + pio = NULL; } } else { /* @@ -1878,7 +1872,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); + db, zio_t *, pio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) @@ -1887,6 +1881,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) } } + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); + } + return (err); } @@ -2631,26 +2632,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - if (dr->dt.dl.dr_brtwrite) { + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned - * block, we cannot simply redirty it, because - * this dr has no data associated with it. 
@@ -2631,26 +2632,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
     ASSERT(!zfs_refcount_is_zero(&db->db_holds));

     /*
-     * Quick check for dirtiness. For already dirty blocks, this
-     * reduces runtime of this function by >90%, and overall performance
-     * by 50% for some workloads (e.g. file deletion with indirect blocks
-     * cached).
+     * Quick check for dirtiness to improve performance for some workloads
+     * (e.g. file deletion with indirect blocks cached).
      */
     mutex_enter(&db->db_mtx);

-    if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
-        dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
     /*
-     * It's possible that it is already dirty but not cached,
+     * It's possible that the dbuf is already dirty but not cached,
      * because there are some calls to dbuf_dirty() that don't
      * go through dmu_buf_will_dirty().
      */
+    dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
     if (dr != NULL) {
-        if (dr->dt.dl.dr_brtwrite) {
+        if (db->db_level == 0 &&
+            dr->dt.dl.dr_brtwrite) {
             /*
              * Block cloning: If we are dirtying a cloned
-             * block, we cannot simply redirty it, because
-             * this dr has no data associated with it.
+             * level 0 block, we cannot simply redirty it,
+             * because this dr has no associated data.
              * We will go through a full undirtying below,
              * before dirtying it again.
              */
@@ -4597,11 +4596,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
     if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
         dbuf_prepare_encrypted_dnode_leaf(dr);

-    if (db->db_state != DB_NOFILL &&
+    if (*datap != NULL && *datap == db->db_buf &&
         dn->dn_object != DMU_META_DNODE_OBJECT &&
         zfs_refcount_count(&db->db_holds) > 1 &&
-        dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
-        *datap == db->db_buf) {
+        dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
         /*
          * If this buffer is currently "in use" (i.e., there
          * are active holds and db_data still references it),
@@ -4890,11 +4888,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
     if (db->db_level == 0) {
         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
         ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-        if (db->db_state != DB_NOFILL) {
-            if (dr->dt.dl.dr_data != NULL &&
-                dr->dt.dl.dr_data != db->db_buf) {
-                arc_buf_destroy(dr->dt.dl.dr_data, db);
-            }
+        if (dr->dt.dl.dr_data != NULL &&
+            dr->dt.dl.dr_data != db->db_buf) {
+            arc_buf_destroy(dr->dt.dl.dr_data, db);
         }
     } else {
         ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -5097,21 +5093,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)

     os = dn->dn_objset;

-    if (db->db_state != DB_NOFILL) {
-        if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
-            /*
-             * Private object buffers are released here rather
-             * than in dbuf_dirty() since they are only modified
-             * in the syncing context and we don't want the
-             * overhead of making multiple copies of the data.
-             */
-            if (BP_IS_HOLE(db->db_blkptr)) {
-                arc_buf_thaw(data);
-            } else {
-                dbuf_release_bp(db);
-            }
-            dbuf_remap(dn, db, tx);
-        }
+    if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+        /*
+         * Private object buffers are released here rather than in
+         * dbuf_dirty() since they are only modified in the syncing
+         * context and we don't want the overhead of making multiple
+         * copies of the data.
+         */
+        if (BP_IS_HOLE(db->db_blkptr))
+            arc_buf_thaw(data);
+        else
+            dbuf_release_bp(db);
+        dbuf_remap(dn, db, tx);
     }

     if (parent != dn->dn_dbuf) {
@@ -5147,7 +5140,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
     if (db->db_blkid == DMU_SPILL_BLKID)
         wp_flag = WP_SPILL;
-    wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+    wp_flag |= (data == NULL) ? WP_NOFILL : 0;

     dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);

@@ -5179,7 +5172,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
         dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
         mutex_exit(&db->db_mtx);
-    } else if (db->db_state == DB_NOFILL) {
+    } else if (data == NULL) {
         ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
             zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
         dr->dr_zio = zio_write(pio, os->os_spa, txg,
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 753dde6d5205..6ef149aab9a6 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     for (i = 0; i < nblks; i++) {
         dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
         if (db == NULL) {
-            if (zs)
-                dmu_zfetch_run(zs, missed, B_TRUE);
+            if (zs) {
+                dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+                    B_TRUE);
+            }
             rw_exit(&dn->dn_struct_rwlock);
             dmu_buf_rele_array(dbp, nblks, tag);
             if (read)
@@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
         zfs_racct_write(length, nblks);

     if (zs)
-        dmu_zfetch_run(zs, missed, B_TRUE);
+        dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
     rw_exit(&dn->dn_struct_rwlock);

     if (read) {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 2b2d72671001..915d99916d2e 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024;
 #endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int zfetch_hole_shift = 2;

 typedef struct zfetch_stats {
     kstat_named_t zfetchstat_hits;
+    kstat_named_t zfetchstat_future;
+    kstat_named_t zfetchstat_stride;
+    kstat_named_t zfetchstat_past;
     kstat_named_t zfetchstat_misses;
     kstat_named_t zfetchstat_max_streams;
     kstat_named_t zfetchstat_io_issued;
@@ -76,6 +83,9 @@ typedef struct zfetch_stats {
 static zfetch_stats_t zfetch_stats = {
     { "hits", KSTAT_DATA_UINT64 },
+    { "future", KSTAT_DATA_UINT64 },
+    { "stride", KSTAT_DATA_UINT64 },
+    { "past", KSTAT_DATA_UINT64 },
     { "misses", KSTAT_DATA_UINT64 },
     { "max_streams", KSTAT_DATA_UINT64 },
     { "io_issued", KSTAT_DATA_UINT64 },
@@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {
 struct {
     wmsum_t zfetchstat_hits;
+    wmsum_t zfetchstat_future;
+    wmsum_t zfetchstat_stride;
+    wmsum_t zfetchstat_past;
     wmsum_t zfetchstat_misses;
     wmsum_t zfetchstat_max_streams;
     wmsum_t zfetchstat_io_issued;
@@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
         return (EACCES);
     zs->zfetchstat_hits.value.ui64 =
         wmsum_value(&zfetch_sums.zfetchstat_hits);
+    zs->zfetchstat_future.value.ui64 =
+        wmsum_value(&zfetch_sums.zfetchstat_future);
+    zs->zfetchstat_stride.value.ui64 =
+        wmsum_value(&zfetch_sums.zfetchstat_stride);
+    zs->zfetchstat_past.value.ui64 =
+        wmsum_value(&zfetch_sums.zfetchstat_past);
     zs->zfetchstat_misses.value.ui64 =
         wmsum_value(&zfetch_sums.zfetchstat_misses);
     zs->zfetchstat_max_streams.value.ui64 =
@@ -122,6 +141,9 @@ void
 zfetch_init(void)
 {
     wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+    wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+    wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+    wmsum_init(&zfetch_sums.zfetchstat_past, 0);
     wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
     wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
     wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
@@ -147,6 +169,9 @@ zfetch_fini(void)
     }
     wmsum_fini(&zfetch_sums.zfetchstat_hits);
+    wmsum_fini(&zfetch_sums.zfetchstat_future);
+    wmsum_fini(&zfetch_sums.zfetchstat_stride);
+    wmsum_fini(&zfetch_sums.zfetchstat_past);
     wmsum_fini(&zfetch_sums.zfetchstat_misses);
     wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
     wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
@@ -222,22 +247,22 @@ static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
     zstream_t *zs, *zs_next, *zs_old = NULL;
-    hrtime_t now = gethrtime(), t;
+    uint_t now = gethrestime_sec(), t;

     ASSERT(MUTEX_HELD(&zf->zf_lock));

     /*
      * Delete too old streams, reusing the first found one.
      */
-    t = now - SEC2NSEC(zfetch_max_sec_reap);
+    t = now - zfetch_max_sec_reap;
     for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
         zs_next = list_next(&zf->zf_stream, zs);
         /*
          * Skip if still active. 1 -- zf_stream reference.
          */
-        if (zfs_refcount_count(&zs->zs_refs) != 1)
+        if ((int)(zs->zs_atime - t) >= 0)
             continue;
-        if (zs->zs_atime > t)
+        if (zfs_refcount_count(&zs->zs_refs) != 1)
             continue;
         if (zs_old)
             dmu_zfetch_stream_remove(zf, zs);
@@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
     }
     if (zs_old) {
         zs = zs_old;
+        list_remove(&zf->zf_stream, zs);
         goto reuse;
     }

@@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
      * for all the streams to be non-overlapping.
      */
     uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
-        zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+        (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
         zfetch_max_distance));
     if (zf->zf_numstreams >= max_streams) {
-        t = now - SEC2NSEC(zfetch_min_sec_reap);
+        t = now - zfetch_min_sec_reap;
         for (zs = list_head(&zf->zf_stream); zs != NULL;
             zs = list_next(&zf->zf_stream, zs)) {
-            if (zfs_refcount_count(&zs->zs_refs) != 1)
+            if ((int)(zs->zs_atime - t) >= 0)
                 continue;
-            if (zs->zs_atime > t)
+            if (zfs_refcount_count(&zs->zs_refs) != 1)
                 continue;
-            if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+            if (zs_old == NULL ||
+                (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
                 zs_old = zs;
         }
         if (zs_old) {
             zs = zs_old;
+            list_remove(&zf->zf_stream, zs);
             goto reuse;
         }
         ZFETCHSTAT_BUMP(zfetchstat_max_streams);
@@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
     }

     zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
-    zs->zs_fetch = zf;
     zfs_refcount_create(&zs->zs_callers);
     zfs_refcount_create(&zs->zs_refs);
     /* One reference for zf_stream. */
     zfs_refcount_add(&zs->zs_refs, NULL);
     zf->zf_numstreams++;
-    list_insert_head(&zf->zf_stream, zs);

 reuse:
+    list_insert_head(&zf->zf_stream, zs);
     zs->zs_blkid = blkid;
+    /* Allow immediate stream reuse until first hit. */
+    zs->zs_atime = now - zfetch_min_sec_reap;
+    memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
     zs->zs_pf_dist = 0;
+    zs->zs_ipf_dist = 0;
     zs->zs_pf_start = blkid;
     zs->zs_pf_end = blkid;
-    zs->zs_ipf_dist = 0;
     zs->zs_ipf_start = blkid;
     zs->zs_ipf_end = blkid;
-    /* Allow immediate stream reuse until first hit. */
-    zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
     zs->zs_missed = B_FALSE;
     zs->zs_more = B_FALSE;
 }
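The reap logic above now keeps zs_atime in whole seconds (gethrestime_sec()) and compares ages with the idiom (int)(a - b) >= 0 instead of a direct >=. The short standalone demo below shows why that idiom is preferred once the unsigned seconds counter can wrap; it uses fixed-width types for portability and assumes nothing ZFS-specific.

#include <inttypes.h>
#include <stdio.h>

/* Wrap-safe "a is not older than b" check, as used for zs_atime above. */
static int
sim_not_older(uint32_t a, uint32_t b)
{
	return ((int32_t)(a - b) >= 0);
}

int
main(void)
{
	uint32_t now = UINT32_MAX - 5;   /* seconds counter close to wrap */
	uint32_t atime = now + 10;       /* wraps around to a small value */
	uint32_t threshold = now - 2;    /* e.g. now - zfetch_min_sec_reap */

	printf("plain >= compare: %d\n", atime >= threshold);              /* 0, wrong */
	printf("wrap-safe check:  %d\n", sim_not_older(atime, threshold)); /* 1, right */
	return (0);
}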
@@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
     aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }

+/*
+ * Process stream hit access for nblks blocks starting at zs_blkid. Return
+ * the number of blocks to proceed for after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+    uint_t i, j;
+
+    /* Optimize sequential accesses (no future ranges). */
+    if (zs->zs_ranges[0].start == 0)
+        goto done;
+
+    /* Look for intersections with further ranges. */
+    for (i = 0; i < ZFETCH_RANGES; i++) {
+        zsrange_t *r = &zs->zs_ranges[i];
+        if (r->start == 0 || r->start > nblks)
+            break;
+        if (r->end >= nblks) {
+            nblks = r->end;
+            i++;
+            break;
+        }
+    }
+
+    /* Delete all intersecting ranges found, updating the remaining ones. */
+    for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+        if (zs->zs_ranges[i].start == 0)
+            break;
+        ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+        ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+        zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+        zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+    }
+    if (j < ZFETCH_RANGES) {
+        zs->zs_ranges[j].start = 0;
+        zs->zs_ranges[j].end = 0;
+    }
+
+done:
+    zs->zs_blkid += nblks;
+    return (nblks);
+}
+
+/*
+ * Process future stream access for nblks blocks starting at blkid. Return
+ * the number of blocks to proceed for if the future ranges reach the fill
+ * threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+    ASSERT3U(blkid, >, zs->zs_blkid);
+    blkid -= zs->zs_blkid;
+    ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+    /* Search for first and last intersection or insert point. */
+    uint_t f = ZFETCH_RANGES, l = 0, i;
+    for (i = 0; i < ZFETCH_RANGES; i++) {
+        zsrange_t *r = &zs->zs_ranges[i];
+        if (r->start == 0 || r->start > blkid + nblks)
+            break;
+        if (r->end < blkid)
+            continue;
+        if (f > i)
+            f = i;
+        if (l < i)
+            l = i;
+    }
+    if (f <= l) {
+        /* Got some intersecting range, expand it if needed. */
+        if (zs->zs_ranges[f].start > blkid)
+            zs->zs_ranges[f].start = blkid;
+        zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+        if (f < l) {
+            /* Got more than one intersection, remove others. */
+            for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+                zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+                zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+            }
+            zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
+            zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
+        }
+    } else if (i < ZFETCH_RANGES) {
+        /* Got no intersecting ranges, insert new one. */
+        for (l = ZFETCH_RANGES - 1; l > i; l--) {
+            zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+            zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+        }
+        zs->zs_ranges[i].start = blkid;
+        zs->zs_ranges[i].end = blkid + nblks;
+    } else {
+        /* No space left to insert. Drop the range. */
+        return (0);
+    }
+
+    /* Check whether, with this new access added, we reached the fill threshold. */
+    if (zfetch_hole_shift >= 16)
+        return (0);
+    uint_t hole = 0;
+    for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+        zsrange_t *r = &zs->zs_ranges[i];
+        if (r->start == 0)
+            break;
+        hole += r->start - f;
+        f = r->end;
+        if (hole <= r->end >> zfetch_hole_shift)
+            l = r->end;
+    }
+    if (l > 0)
+        return (dmu_zfetch_hit(zs, l));
+
+    return (0);
+}
+
 /*
  * This is the predictive prefetch entry point. dmu_zfetch_prepare()
  * associates dnode access specified with blkid and nblks arguments with
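As a sanity check of the bookkeeping in dmu_zfetch_future()/dmu_zfetch_hit() above, here is a reduced userspace model of the same range table: a small sorted array of half-open ranges relative to the stream head, where start == 0 marks an empty slot and touching ranges are merged. The sim_* names and SIM_RANGES are invented for the illustration; the kernel functions additionally handle the fill-threshold and ASSERT logic omitted here.

#include <stdint.h>
#include <stdio.h>

#define	SIM_RANGES	4

typedef struct { uint16_t start, end; } sim_range_t;

static void
sim_insert(sim_range_t *r, uint16_t start, uint16_t end)
{
	unsigned int i, f = SIM_RANGES, l = 0;

	/* Find the first and last existing ranges touching [start, end). */
	for (i = 0; i < SIM_RANGES; i++) {
		if (r[i].start == 0 || r[i].start > end)
			break;
		if (r[i].end < start)
			continue;
		if (f > i)
			f = i;
		if (l < i)
			l = i;
	}
	if (f <= l) {
		/* Expand the first touching range, drop the others. */
		if (r[f].start > start)
			r[f].start = start;
		if (r[f].end < end)
			r[f].end = end;
		if (r[l].end > r[f].end)
			r[f].end = r[l].end;
		for (f++, l++; l < SIM_RANGES; f++, l++)
			r[f] = r[l];
		for (; f < SIM_RANGES; f++)
			r[f].start = r[f].end = 0;
	} else if (i < SIM_RANGES) {
		/* No overlap: shift the tail up and insert at position i. */
		for (l = SIM_RANGES - 1; l > i; l--)
			r[l] = r[l - 1];
		r[i].start = start;
		r[i].end = end;
	}
	/* else: table full, the range is dropped, as in the patch. */
}

int
main(void)
{
	sim_range_t r[SIM_RANGES] = { { 0, 0 } };

	sim_insert(r, 8, 12);
	sim_insert(r, 2, 4);
	sim_insert(r, 4, 9);	/* merges with both neighbours */
	for (int i = 0; i < SIM_RANGES; i++)
		printf("[%u, %u) ", (unsigned)r[i].start, (unsigned)r[i].end);
	printf("\n");	/* expected: [2, 12) [0, 0) [0, 0) [0, 0) */
	return (0);
}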
@@ -370,53 +512,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
     mutex_enter(&zf->zf_lock);

     /*
-     * Find matching prefetch stream. Depending on whether the accesses
+     * Find a perfect prefetch stream. Depending on whether the accesses
      * are block-aligned, first block of the new access may either follow
      * the last block of the previous access, or be equal to it.
      */
+    unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+    uint64_t end_blkid = blkid + nblks;
     for (zs = list_head(&zf->zf_stream); zs != NULL;
         zs = list_next(&zf->zf_stream, zs)) {
         if (blkid == zs->zs_blkid) {
-            break;
+            goto hit;
         } else if (blkid + 1 == zs->zs_blkid) {
             blkid++;
             nblks--;
-            break;
+            goto hit;
         }
     }

     /*
-     * If the file is ending, remove the matching stream if found.
-     * If not found then it is too late to create a new one now.
+     * Find a close-enough prefetch stream. An access crossing the stream
+     * position is a hit in its new part. An access ahead of the stream
+     * position is considered a hit for metadata prefetch, since we do not
+     * care about fill percent, and is otherwise stored for the future. An
+     * access behind the stream position is silently ignored, since we
+     * already skipped it when the fill percent was reached.
      */
-    uint64_t end_of_access_blkid = blkid + nblks;
-    if (end_of_access_blkid >= maxblkid) {
-        if (zs != NULL)
-            dmu_zfetch_stream_remove(zf, zs);
-        mutex_exit(&zf->zf_lock);
-        if (!have_lock)
-            rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-        return (NULL);
+    uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+    uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+    for (zs = list_head(&zf->zf_stream); zs != NULL;
+        zs = list_next(&zf->zf_stream, zs)) {
+        if (blkid > zs->zs_blkid) {
+            if (end_blkid <= zs->zs_blkid + max_reorder) {
+                if (!fetch_data) {
+                    nblks = dmu_zfetch_hit(zs,
+                        end_blkid - zs->zs_blkid);
+                    ZFETCHSTAT_BUMP(zfetchstat_stride);
+                    goto future;
+                }
+                nblks = dmu_zfetch_future(zs, blkid, nblks);
+                if (nblks > 0)
+                    ZFETCHSTAT_BUMP(zfetchstat_stride);
+                else
+                    ZFETCHSTAT_BUMP(zfetchstat_future);
+                goto future;
+            }
+        } else if (end_blkid >= zs->zs_blkid) {
+            nblks -= zs->zs_blkid - blkid;
+            blkid += zs->zs_blkid - blkid;
+            goto hit;
+        } else if (end_blkid + max_reorder > zs->zs_blkid &&
+            (int)(zs->zs_atime - t) >= 0) {
+            ZFETCHSTAT_BUMP(zfetchstat_past);
+            zs->zs_atime = gethrestime_sec();
+            goto out;
+        }
     }

-    /* Exit if we already prefetched this block before. */
-    if (nblks == 0) {
-        mutex_exit(&zf->zf_lock);
-        if (!have_lock)
-            rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-        return (NULL);
-    }
+    /*
+     * This access is not part of any existing stream. Create a new
+     * stream for it unless we are at the end of file.
+     */
+    if (end_blkid < maxblkid)
+        dmu_zfetch_stream_create(zf, end_blkid);
+    mutex_exit(&zf->zf_lock);
+    if (!have_lock)
+        rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+    ZFETCHSTAT_BUMP(zfetchstat_misses);
+    return (NULL);

-    if (zs == NULL) {
-        /*
-         * This access is not part of any existing stream. Create
-         * a new stream for it.
-         */
-        dmu_zfetch_stream_create(zf, end_of_access_blkid);
+hit:
+    nblks = dmu_zfetch_hit(zs, nblks);
+    ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+    zs->zs_atime = gethrestime_sec();
+
+    /* Exit if we already prefetched for this position before. */
+    if (nblks == 0)
+        goto out;
+
+    /* If the file is ending, remove the stream. */
+    end_blkid = zs->zs_blkid;
+    if (end_blkid >= maxblkid) {
+        dmu_zfetch_stream_remove(zf, zs);
+out:
         mutex_exit(&zf->zf_lock);
         if (!have_lock)
             rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-        ZFETCHSTAT_BUMP(zfetchstat_misses);
         return (NULL);
     }

@@ -432,7 +613,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
      * than ~6% of ARC held by active prefetches. It should help with
      * getting out of RAM on some badly mispredicted read patterns.
      */
-    unsigned int dbs = zf->zf_dnode->dn_datablkshift;
     unsigned int nbytes = nblks << dbs;
     unsigned int pf_nblks;
     if (fetch_data) {
@@ -452,10 +632,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
     } else {
         pf_nblks = 0;
     }
-    if (zs->zs_pf_start < end_of_access_blkid)
-        zs->zs_pf_start = end_of_access_blkid;
-    if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
-        zs->zs_pf_end = end_of_access_blkid + pf_nblks;
+    if (zs->zs_pf_start < end_blkid)
+        zs->zs_pf_start = end_blkid;
+    if (zs->zs_pf_end < end_blkid + pf_nblks)
+        zs->zs_pf_end = end_blkid + pf_nblks;

     /*
      * Do the same for indirects, starting where we will stop reading
@@ -473,9 +653,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
     if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
         zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

-    zs->zs_blkid = end_of_access_blkid;
-    /* Protect the stream from reclamation. */
-    zs->zs_atime = gethrtime();
     zfs_refcount_add(&zs->zs_refs, NULL);
     /* Count concurrent callers. */
     zfs_refcount_add(&zs->zs_callers, NULL);
@@ -483,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
     if (!have_lock)
         rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
-    ZFETCHSTAT_BUMP(zfetchstat_hits);
     return (zs);
 }

 void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+    boolean_t have_lock)
 {
-    zfetch_t *zf = zs->zs_fetch;
     int64_t pf_start, pf_end, ipf_start, ipf_end;
     int epbs, issued;

@@ -567,7 +742,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,

     zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
     if (zs)
-        dmu_zfetch_run(zs, missed, have_lock);
+        dmu_zfetch_run(zf, zs, missed, have_lock);
 }

 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
@@ -590,3 +765,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
     "Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+    "Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+    "Max log2 fraction of holes in a stream");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index e96bbda35a04..08d56eef83e9 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4086,14 +4086,17 @@ zio_vdev_io_done(zio_t *zio)
     }

     ASSERT(zio->io_type == ZIO_TYPE_READ ||
-        zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+        zio->io_type == ZIO_TYPE_WRITE ||
+        zio->io_type == ZIO_TYPE_IOCTL ||
+        zio->io_type == ZIO_TYPE_TRIM);

     if (zio->io_delay)
         zio->io_delay = gethrtime() - zio->io_delay;

     if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
         vd->vdev_ops != &vdev_draid_spare_ops) {
-        vdev_queue_io_done(zio);
+        if (zio->io_type != ZIO_TYPE_IOCTL)
+            vdev_queue_io_done(zio);

         if (zio_injection_enabled && zio->io_error == 0)
             zio->io_error = zio_handle_device_injections(vd, zio,
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 609182f4a2cd..0a4851ecb40d 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -364,10 +364,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
     int ret = 0;

     /*
-     * We skip over faults in the labels unless it's during
-     * device open (i.e. zio == NULL).
+     * We skip over faults in the labels unless it's during device open
+     * (i.e. zio == NULL) or a device flush (offset is meaningless).
      */
-    if (zio != NULL) {
+    if (zio != NULL && zio->io_type != ZIO_TYPE_IOCTL) {
         uint64_t offset = zio->io_offset;

         if (offset < VDEV_LABEL_START_SIZE ||
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index d4c5a21828a1..912344b4edde 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -153,6 +153,12 @@
 tests = ['clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']

+[tests/functional/cli_root/zinject]
+tests = ['zinject_args']
+pre =
+post =
+tags = ['functional', 'cli_root', 'zinject']
+
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 866ea5b9e7ec..db6b4c0146a7 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -606,6 +606,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
     functional/clean_mirror/clean_mirror_004_pos.ksh \
     functional/clean_mirror/cleanup.ksh \
     functional/clean_mirror/setup.ksh \
+    functional/cli_root/zinject/zinject_args.ksh \
     functional/cli_root/zdb/zdb_002_pos.ksh \
     functional/cli_root/zdb/zdb_003_pos.ksh \
     functional/cli_root/zdb/zdb_004_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
new file mode 100755
index 000000000000..f8a8ffbb7b0e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
@@ -0,0 +1,62 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+#
+# TODO: this only checks the set of valid device fault types. It should
+# check all the other options, and that they work, and everything really.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Check zinject parameters."
+
+log_onexit cleanup
+
+DISK1=${DISKS%% *}
+
+function cleanup
+{
+        zinject -c all
+        default_cleanup_noexit
+}
+
+function test_device_fault
+{
+        typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt")
+        for e in ${errno[@]}; do
+                log_must eval \
+                    "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"
+        done
+        zinject -c all
+}
+
+default_mirror_setup_noexit $DISKS
+
+test_device_fault
+
+log_pass "zinject parameters work as expected."
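For a rough feel of what the two new tunables mean in practice, the snippet below redoes the arithmetic from dmu_zfetch_prepare() and dmu_zfetch_future() for one assumed configuration (128 KB records and the default tunable values). The numbers are only an illustration, not output produced by the patch.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned int zfetch_max_reorder = 16 * 1024 * 1024;	/* bytes */
	unsigned int zfetch_hole_shift = 2;			/* log2 */
	unsigned int dbs = 17;					/* 128 KB blocks */

	/* Same clamp as dmu_zfetch_prepare(): reorder window in blocks. */
	unsigned int max_reorder = (zfetch_max_reorder >> dbs) + 1;
	if (max_reorder > UINT16_MAX)
		max_reorder = UINT16_MAX;
	printf("reorder window: %u blocks\n", max_reorder);	/* 129 */

	/*
	 * Fill check from dmu_zfetch_future(): a prefix of the stream is
	 * accepted once the holes in it are at most 1/2^hole_shift of its
	 * length, i.e. 25% for the default hole_shift of 2.
	 */
	unsigned int end = 64, hole = 15;
	printf("hole %u of %u accepted: %d\n", hole, end,
	    hole <= (end >> zfetch_hole_shift));		/* 1 */
	return (0);
}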