diff --git a/.mailmap b/.mailmap
index 46ef016b93f8..32bdb5209613 100644
--- a/.mailmap
+++ b/.mailmap
@@ -30,6 +30,7 @@ Andreas Dilger
 Andrew Walker
 Benedikt Neuffer
 Chengfei Zhu
+ChenHao Lu <18302010006@fudan.edu.cn>
 Chris Lindee
 Colm Buckley
 Crag Wang
@@ -43,6 +44,7 @@ Glenn Washburn
 Gordan Bobic
 Gregory Bartholomew
 hedong zhang
+Ilkka Sovanto
 InsanePrawn
 Jason Cohen
 Jason Harmening
@@ -57,6 +59,7 @@ KernelOfTruth
 Liu Hua
 Liu Qing
 loli10K
+Mart Frauenlob
 Matthias Blankertz
 Michael Gmelin
 Olivier Mazouffre
@@ -73,6 +76,9 @@ WHR
 Yanping Gao
 Youzhong Yang
 
+# Signed-off-by: overriding Author:
+Yuxin Wang
+
 # Commits from strange places, long ago
 Brian Behlendorf
 Brian Behlendorf
@@ -102,12 +108,15 @@ Brandon Thetford
 buzzingwires <131118055+buzzingwires@users.noreply.github.com>
 Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com>
 Charles Suh
+Chris Peredun <126915832+chrisperedun@users.noreply.github.com>
 Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com>
 Damian Szuberski <30863496+szubersk@users.noreply.github.com>
 Daniel Hiepler <32984777+heeplr@users.noreply.github.com>
 Daniel Kobras
 Daniel Reichelt
 David Quigley
+Dennis R. Friedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com>
+Dex Wood
 DHE
 Dmitri John Ledkov <19779+xnox@users.noreply.github.com>
 Dries Michiels <32487486+driesmp@users.noreply.github.com>
@@ -128,6 +137,7 @@ Harry Mallon <1816667+hjmallon@users.noreply.github.com>
 Hiếu Lê
 Jake Howard
 James Cowgill
+Jaron Kent-Dobias
 Jason King
 Jeff Dike <52420226+jdike@users.noreply.github.com>
 Jitendra Patidar <53164267+jsai20@users.noreply.github.com>
@@ -137,7 +147,9 @@ John L. Hammond <35266395+jhammond-intel@users.noreply.
 John-Mark Gurney
 John Ramsden
 Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com>
+Jose Luis Duran
 Justin Hibbits
+Kevin Greene <104801862+kxgreene@users.noreply.github.com>
 Kevin Jin <33590050+jxdking@users.noreply.github.com>
 Kevin P. Fleming
 Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com>
@@ -148,9 +160,11 @@ Lorenz Hüdepohl
 Luís Henriques <73643340+lumigch@users.noreply.github.com>
 Marcin Skarbek
 Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com>
+Maxim Filimonov
 Max Zettlmeißl <6818198+maxz@users.noreply.github.com>
 Michael Niewöhner
 Michael Zhivich <33133421+mzhivich@users.noreply.github.com>
+MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com>
 Mo Zhou <5723047+cdluminate@users.noreply.github.com>
 Nick Mattis
 omni <79493359+omnivagant@users.noreply.github.com>
@@ -164,6 +178,7 @@ Ping Huang <101400146+hpingfs@users.noreply.github.com>
 Piotr P. Stefaniak
 Richard Allen <33836503+belperite@users.noreply.github.com>
 Rich Ercolani <214141+rincebrain@users.noreply.github.com>
+Rick Macklem <64620010+rmacklem@users.noreply.github.com>
 Rob Wing <98866084+rob-wing@users.noreply.github.com>
 Roman Strashkin
 Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com>
@@ -174,6 +189,8 @@ Scott Colby
 Sean Eric Fagan
 Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com>
 Srikanth N S <75025422+nssrikanth@users.noreply.github.com>
+Stefan Lendl <1321542+stfl@users.noreply.github.com>
+Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com>
 Thomas Geppert
 Tim Crawford
 Tom Matthews
@@ -181,6 +198,7 @@ Tony Perkins <62951051+tony-zfs@users.noreply.github.com>
 Torsten Wörtwein
 Tulsi Jain
 Václav Skála <33496485+vaclavskala@users.noreply.github.com>
+Vaibhav Bhanawat <88050553+vaibhav-delphix@users.noreply.github.com>
 Violet Purcell <66446404+vimproved@users.noreply.github.com>
 Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com>
 Wolfgang Bumiller
diff --git a/AUTHORS b/AUTHORS
index be1efb87b34c..d7d55f42d2e7 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -88,9 +88,11 @@ CONTRIBUTORS:
     Bassu
     Ben Allen
     Ben Cordero
+    Benda Xu
     Benedikt Neuffer
     Benjamin Albrecht
     Benjamin Gentil
+    Benjamin Sherman
     Ben McGough
     Ben Rubson
     Ben Wolsieffer
@@ -111,6 +113,7 @@ CONTRIBUTORS:
     bzzz77
     cable2999
     Caleb James DeLisle
+    Cameron Harr
     Cao Xuewen
     Carlo Landmeter
     Carlos Alberto Lopez Perez
@@ -120,12 +123,15 @@ CONTRIBUTORS:
     Chen Can
     Chengfei Zhu
     Chen Haiquan
+    ChenHao Lu <18302010006@fudan.edu.cn>
     Chip Parker
     Chris Burroughs
+    Chris Davidson
     Chris Dunlap
     Chris Dunlop
     Chris Lindee
     Chris McDonough
+    Chris Peredun
     Chris Siden
     Chris Siebenmann
     Christer Ekholm
@@ -144,6 +150,7 @@ CONTRIBUTORS:
     Clint Armstrong
     Coleman Kane
     Colin Ian King
+    Colin Percival
     Colm Buckley
     Crag Wang
     Craig Loomis
@@ -156,6 +163,7 @@ CONTRIBUTORS:
     Damiano Albani
     Damian Szuberski
     Damian Wojsław
+    Daniel Berlin
     Daniel Hiepler
     Daniel Hoffman
     Daniel Kobras
@@ -176,8 +184,10 @@ CONTRIBUTORS:
     David Quigley
     Debabrata Banerjee
     D. Ebdrup
+    Dennis R. Friedrichsen
     Denys Rtveliashvili
     Derek Dai
+    Dex Wood
     DHE
     Didier Roche
     Dimitri John Ledkov
@@ -235,9 +245,11 @@ CONTRIBUTORS:
     Gionatan Danti
     Giuseppe Di Natale
     Glenn Washburn
+    gofaster
     Gordan Bobic
     Gordon Bergling
     Gordon Ross
+    Gordon Tetlow
     Graham Christensen
     Graham Perrin
     Gregor Kopka
@@ -265,6 +277,7 @@ CONTRIBUTORS:
     Igor Kozhukhov
     Igor Lvovsky
     ilbsmart
+    Ilkka Sovanto
     illiliti
     ilovezfs
     InsanePrawn
@@ -280,9 +293,11 @@ CONTRIBUTORS:
     Jan Engelhardt
     Jan Kryl
     Jan Sanislo
+    Jaron Kent-Dobias
     Jason Cohen
     Jason Harmening
     Jason King
+    Jason Lee
     Jason Zaman
     Javen Wu
     Jean-Baptiste Lallement
@@ -313,6 +328,7 @@ CONTRIBUTORS:
     Jonathon Fernyhough
     Jorgen Lundman
     Josef 'Jeff' Sipek
+    Jose Luis Duran
     Josh Soref
     Joshua M. Clulow
     José Luis Salvador Rufo
@@ -336,8 +352,10 @@ CONTRIBUTORS:
     Kash Pande
     Kay Pedersen
     Keith M Wesolowski
+    Kent Ross
     KernelOfTruth
     Kevin Bowling
+    Kevin Greene
     Kevin Jin
     Kevin P. Fleming
     Kevin Tanguy
@@ -389,6 +407,7 @@ CONTRIBUTORS:
     Mark Shellenbaum
     marku89
     Mark Wright
+    Mart Frauenlob
     Martin Matuska
     Martin Rüegg
     Massimo Maggi
@@ -405,6 +424,7 @@ CONTRIBUTORS:
     Matus Kral
     Mauricio Faria de Oliveira
     Max Grossman
+    Maxim Filimonov
     Maximilian Mehnert
     Max Zettlmeißl
     Md Islam
@@ -417,6 +437,7 @@ CONTRIBUTORS:
     Michael Niewöhner
     Michael Zhivich
     Michal Vasilek
+    MigeljanImeri
     Mike Gerdts
     Mike Harsch
     Mike Leddy
@@ -448,6 +469,7 @@ CONTRIBUTORS:
     Olaf Faaland
     Oleg Drokin
     Oleg Stepura
+    Olivier Certner
     Olivier Mazouffre
     omni
     Orivej Desh
@@ -479,6 +501,7 @@ CONTRIBUTORS:
     Prasad Joshi
     privb0x23
     P.SCH
+    Quartz
     Quentin Zdanis
     Rafael Kitover
     RageLtMan
@@ -491,11 +514,15 @@ CONTRIBUTORS:
     Riccardo Schirone
     Richard Allen
     Richard Elling
+    Richard Kojedzinszky
     Richard Laager
     Richard Lowe
     Richard Sharpe
     Richard Yao
     Rich Ercolani
+    Rick Macklem
+    rilysh
+    Robert Evans
     Robert Novak
     Roberto Ricci
     Rob Norris
@@ -509,7 +536,9 @@ CONTRIBUTORS:
     Ryan Lahfa
     Ryan Libby
     Ryan Moeller
+    Sam Atkinson
     Sam Hathaway
+    Sam James
     Sam Lunt
     Samuel VERSCHELDE
     Samuel Wycliffe
@@ -530,6 +559,8 @@ CONTRIBUTORS:
     Shaan Nobee
     Shampavman
     Shaun Tancheff
+    Shawn Bayern
+    Shengqi Chen
     Shen Yan
     Simon Guest
     Simon Klinkert
@@ -537,6 +568,7 @@ CONTRIBUTORS:
     Spencer Kinny
     Srikanth N S
     Stanislav Seletskiy
+    Stefan Lendl
     Steffen Müthing
     Stephen Blinick
     sterlingjensen
@@ -557,6 +589,7 @@ CONTRIBUTORS:
     Teodor Spæren
     TerraTech
     Thijs Cramer
+    Thomas Bertschinger
     Thomas Geppert
     Thomas Lamprecht
     Till Maas
@@ -586,6 +619,7 @@ CONTRIBUTORS:
     Turbo Fredriksson
     Tyler J. Stachecki
     Umer Saleem
+    Vaibhav Bhanawat
     Valmiky Arquissandas
     Val Packett
     Vince van Oosten
@@ -614,6 +648,7 @@ CONTRIBUTORS:
     yuina822
     YunQiang Su
     Yuri Pankov
+    Yuxin Wang
     Yuxuan Shui
     Zachary Bedell
     Zach Dykstra
diff --git a/META b/META
index 8a257f0feb17..19a796050f5b 100644
--- a/META
+++ b/META
@@ -6,5 +6,5 @@ Release: 1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS
-Linux-Maximum: 6.7
+Linux-Maximum: 6.8
 Linux-Minimum: 3.10
diff --git a/cmd/arc_summary b/cmd/arc_summary
index 9c69ec4f8ccc..100fb1987a8b 100755
--- a/cmd/arc_summary
+++ b/cmd/arc_summary
@@ -793,18 +793,27 @@ def section_dmu(kstats_dict):
 
     zfetch_stats = isolate_section('zfetchstats', kstats_dict)
 
-    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+    zfetch_access_total = int(zfetch_stats['hits']) +\
+        int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
+        int(zfetch_stats['past']) + int(zfetch_stats['misses'])
 
     prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
     prt_i2('Stream hits:',
            f_perc(zfetch_stats['hits'], zfetch_access_total),
            f_hits(zfetch_stats['hits']))
+    future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
+    prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
+           f_hits(future))
+    prt_i2('Hits behind stream:',
+           f_perc(zfetch_stats['past'], zfetch_access_total),
+           f_hits(zfetch_stats['past']))
     prt_i2('Stream misses:',
            f_perc(zfetch_stats['misses'], zfetch_access_total),
            f_hits(zfetch_stats['misses']))
     prt_i2('Streams limit reached:',
            f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
            f_hits(zfetch_stats['max_streams']))
+    prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
     prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
     print()
 
diff --git a/cmd/arcstat.in b/cmd/arcstat.in
index 8df1c62f7e86..c4f10a1d6d3b 100755
--- a/cmd/arcstat.in
+++ b/cmd/arcstat.in
@@ -157,6 +157,16 @@ cols = {
     "free": [5, 1024, "ARC free memory"],
     "avail": [5, 1024, "ARC available memory"],
     "waste": [5, 1024, "Wasted memory due to round up to pagesize"],
+    "ztotal": [6, 1000, "zfetch total prefetcher calls per second"],
+    "zhits": [5, 1000, "zfetch stream hits per second"],
+    "zahead": [6, 1000, "zfetch hits ahead of streams per second"],
+    "zpast": [5, 1000, "zfetch hits behind streams per second"],
+    "zmisses": [7, 1000, "zfetch stream misses per second"],
+    "zmax": [4, 1000, "zfetch limit reached per second"],
+    "zfuture": [7, 1000, "zfetch stream future per second"],
+    "zstride": [7, 1000, "zfetch stream strides per second"],
+    "zissued": [7, 1000, "zfetch prefetches issued per second"],
+    "zactive": [7, 1000, "zfetch prefetches active per second"],
 }
 
 v = {}
@@ -164,6 +174,8 @@ hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%",
        "size", "c", "avail"]
 xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis",
         "dread", "pread", "read"]
+zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax",
+        "zfuture", "zstride", "zissued", "zactive"]
 sint = 1               # Default interval is 1 second
 count = 1              # Default count is 1
 hdr_intr = 20          # Print header every 20 lines of output
@@ -188,6 +200,8 @@ if sys.platform.startswith('freebsd'):
         k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats')
              if ctl.type != sysctl.CTLTYPE_NODE]
+        k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats')
+              if ctl.type != sysctl.CTLTYPE_NODE]
 
         if not k:
             sys.exit(1)
@@ -199,19 +213,28 @@ if sys.platform.startswith('freebsd'):
                 continue
 
             name, value = s.name, s.value
-            # Trims 'kstat.zfs.misc.arcstats' from the name
-            kstat[name[24:]] = int(value)
+
+            if "arcstats" in name:
+                # Trims 'kstat.zfs.misc.arcstats' from the name
+                kstat[name[24:]] = int(value)
+            else:
+                kstat["zfetch_" + name[27:]] = int(value)
 
 elif sys.platform.startswith('linux'):
     def kstat_update():
         global kstat
 
-        k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
+        k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
 
-        if not k:
+        k2 = ["zfetch_" + line.strip() for line in
+              open('/proc/spl/kstat/zfs/zfetchstats')]
+
+        if k1 is None or k2 is None:
             sys.exit(1)
 
-        del k[0:2]
+        del k1[0:2]
+        del k2[0:2]
+        k = k1 + k2
 
         kstat = {}
         for s in k:
@@ -239,6 +262,7 @@ def usage():
     sys.stderr.write("\t -v : List all possible field headers and definitions"
                      "\n")
     sys.stderr.write("\t -x : Print extended stats\n")
+    sys.stderr.write("\t -z : Print zfetch stats\n")
     sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
     sys.stderr.write("\t -o : Redirect output to the specified file\n")
     sys.stderr.write("\t -s : Override default field separator with custom "
@@ -357,6 +381,7 @@ def init():
     global count
     global hdr
     global xhdr
+    global zhdr
     global opfile
     global sep
     global out
@@ -368,15 +393,17 @@ def init():
     xflag = False
     hflag = False
     vflag = False
+    zflag = False
     i = 1
 
     try:
         opts, args = getopt.getopt(
             sys.argv[1:],
-            "axo:hvs:f:p",
+            "axzo:hvs:f:p",
             [
                 "all",
                 "extended",
+                "zfetch",
                 "outfile",
                 "help",
                 "verbose",
@@ -410,13 +437,15 @@ def init():
             i += 1
         if opt in ('-p', '--parsable'):
             pretty_print = False
+        if opt in ('-z', '--zfetch'):
+            zflag = True
         i += 1
 
     argv = sys.argv[i:]
     sint = int(argv[0]) if argv else sint
     count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1)
 
-    if hflag or (xflag and desired_cols):
+    if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols):
         usage()
 
     if vflag:
@@ -425,6 +454,9 @@ def init():
     if xflag:
         hdr = xhdr
 
+    if zflag:
+        hdr = zhdr
+
     update_hdr_intr()
 
     # check if L2ARC exists
@@ -569,6 +601,17 @@ def calculate():
v["el2mru"] = d["evict_l2_eligible_mru"] // sint v["el2inel"] = d["evict_l2_ineligible"] // sint v["mtxmis"] = d["mutex_miss"] // sint + v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + + d["zfetch_past"] + d["zfetch_misses"]) // sint + v["zhits"] = d["zfetch_hits"] // sint + v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) // sint + v["zpast"] = d["zfetch_past"] // sint + v["zmisses"] = d["zfetch_misses"] // sint + v["zmax"] = d["zfetch_max_streams"] // sint + v["zfuture"] = d["zfetch_future"] // sint + v["zstride"] = d["zfetch_stride"] // sint + v["zissued"] = d["zfetch_io_issued"] // sint + v["zactive"] = d["zfetch_io_active"] // sint if l2exist: v["l2hits"] = d["l2_hits"] // sint diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 558576a306d3..797ae34b6e53 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -120,6 +120,9 @@ static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; +static spa_t *spa; +static objset_t *os; +static boolean_t kernel_init_done; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); @@ -131,6 +134,7 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, static void zdb_print_blkptr(const blkptr_t *bp, int flags); +static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ @@ -199,7 +203,8 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], - .svb_allocated_txg = bp->blk_birth + .svb_allocated_txg = + BP_GET_LOGICAL_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, @@ -817,7 +822,7 @@ usage(void) (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); - exit(1); + zdb_exit(1); } static void @@ -848,7 +853,7 @@ fatal(const char *fmt, ...) 
     dump_debug_buffer();
 
-    exit(1);
+    zdb_exit(1);
 }
 
 static void
@@ -2275,7 +2280,7 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
     buf = malloc(SPA_MAXBLOCKSIZE);
     if (buf == NULL) {
         (void) fprintf(stderr, "out of memory\n");
-        exit(1);
+        zdb_exit(1);
     }
     decode_embedded_bp_compressed(bp, buf);
     memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
@@ -2340,7 +2345,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
             (int)BPE_GET_ETYPE(bp),
             (u_longlong_t)BPE_GET_LSIZE(bp),
             (u_longlong_t)BPE_GET_PSIZE(bp),
-            (u_longlong_t)bp->blk_birth);
+            (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
         return;
     }
 
@@ -2358,7 +2363,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
             buflen - strlen(blkbuf), "%llxL B=%llu",
             (u_longlong_t)BP_GET_LSIZE(bp),
-            (u_longlong_t)bp->blk_birth);
+            (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
     } else {
         (void) snprintf(blkbuf + strlen(blkbuf),
             buflen - strlen(blkbuf),
@@ -2366,8 +2371,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
             (u_longlong_t)BP_GET_LSIZE(bp),
             (u_longlong_t)BP_GET_PSIZE(bp),
             (u_longlong_t)BP_GET_FILL(bp),
-            (u_longlong_t)bp->blk_birth,
-            (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+            (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
+            (u_longlong_t)BP_GET_BIRTH(bp));
         if (bp_freed)
             (void) snprintf(blkbuf + strlen(blkbuf),
                 buflen - strlen(blkbuf), " %s", "FREE");
@@ -2417,7 +2422,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
 {
     int err = 0;
 
-    if (bp->blk_birth == 0)
+    if (BP_GET_LOGICAL_BIRTH(bp) == 0)
         return (0);
 
     print_indirect(spa, bp, zb, dnp);
@@ -2605,7 +2610,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
     (void) arg, (void) tx;
     char blkbuf[BP_SPRINTF_LEN];
 
-    if (bp->blk_birth != 0) {
+    if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
         snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
         (void) printf("\t%s\n", blkbuf);
     }
@@ -2646,7 +2651,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
     (void) arg, (void) tx;
     char blkbuf[BP_SPRINTF_LEN];
 
-    ASSERT(bp->blk_birth != 0);
+    ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
     snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
     (void) printf("\t%s\n", blkbuf);
     return (0);
@@ -3230,6 +3235,23 @@ fuid_table_destroy(void)
     }
 }
 
+static void
+zdb_exit(int reason)
+{
+    if (os != NULL) {
+        close_objset(os, FTAG);
+    } else if (spa != NULL) {
+        spa_close(spa, FTAG);
+    }
+
+    fuid_table_destroy();
+
+    if (kernel_init_done)
+        kernel_fini();
+
+    exit(reason);
+}
+
 /*
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
@@ -4160,32 +4182,32 @@ dump_cachefile(const char *cachefile)
     if ((fd = open64(cachefile, O_RDONLY)) < 0) {
         (void) printf("cannot open '%s': %s\n", cachefile,
             strerror(errno));
-        exit(1);
+        zdb_exit(1);
     }
 
     if (fstat64(fd, &statbuf) != 0) {
         (void) printf("failed to stat '%s': %s\n", cachefile,
             strerror(errno));
-        exit(1);
+        zdb_exit(1);
     }
 
     if ((buf = malloc(statbuf.st_size)) == NULL) {
         (void) fprintf(stderr, "failed to allocate %llu bytes\n",
             (u_longlong_t)statbuf.st_size);
-        exit(1);
+        zdb_exit(1);
     }
 
     if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
         (void) fprintf(stderr, "failed to read %llu bytes\n",
             (u_longlong_t)statbuf.st_size);
-        exit(1);
+        zdb_exit(1);
     }
 
     (void) close(fd);
 
     if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
         (void) fprintf(stderr, "failed to unpack nvlist\n");
-        exit(1);
+        zdb_exit(1);
     }
 
     free(buf);
@@ -5101,14 +5123,14 @@ dump_label(const char *dev)
     if ((fd = open64(path, O_RDONLY)) < 0) {
         (void) printf("cannot open '%s': %s\n", path, strerror(errno));
-        exit(1);
+        zdb_exit(1);
     }
 
     if (fstat64_blk(fd, &statbuf) != 0) {
         (void) printf("failed to stat '%s': %s\n", path,
             strerror(errno));
         (void) close(fd);
-        exit(1);
+        zdb_exit(1);
     }
 
     if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
@@ -5788,7 +5810,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     if (zb->zb_level == ZB_DNODE_LEVEL)
         return (0);
 
-    if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+    if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
         char blkbuf[BP_SPRINTF_LEN];
         snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
         (void) printf("objset %llu object %llu "
@@ -8220,7 +8242,7 @@ dump_zpool(spa_t *spa)
         if (rc != 0) {
             dump_debug_buffer();
-            exit(rc);
+            zdb_exit(rc);
         }
     }
 
@@ -8824,18 +8846,18 @@ zdb_embedded_block(char *thing)
         words + 12, words + 13, words + 14, words + 15);
     if (err != 16) {
         (void) fprintf(stderr, "invalid input format\n");
-        exit(1);
+        zdb_exit(1);
     }
     ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
     buf = malloc(SPA_MAXBLOCKSIZE);
     if (buf == NULL) {
         (void) fprintf(stderr, "out of memory\n");
-        exit(1);
+        zdb_exit(1);
     }
     err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
     if (err != 0) {
         (void) fprintf(stderr, "decode failed: %u\n", err);
-        exit(1);
+        zdb_exit(1);
     }
     zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
     free(buf);
@@ -8862,8 +8884,6 @@ int
 main(int argc, char **argv)
 {
     int c;
-    spa_t *spa = NULL;
-    objset_t *os = NULL;
     int dump_all = 1;
     int verbose = 0;
     int error = 0;
@@ -9092,6 +9112,7 @@ main(int argc, char **argv)
         spa_mode_readable_spacemaps = B_TRUE;
 
     kernel_init(SPA_MODE_READ);
+    kernel_init_done = B_TRUE;
 
     if (dump_all)
         verbose = MAX(verbose, 1);
@@ -9115,19 +9136,23 @@ main(int argc, char **argv)
         if (argc != 1)
             usage();
         zdb_embedded_block(argv[0]);
-        return (0);
+        error = 0;
+        goto fini;
     }
 
     if (argc < 1) {
         if (!dump_opt['e'] && dump_opt['C']) {
             dump_cachefile(spa_config_path);
-            return (0);
+            error = 0;
+            goto fini;
         }
         usage();
     }
 
-    if (dump_opt['l'])
-        return (dump_label(argv[0]));
+    if (dump_opt['l']) {
+        error = dump_label(argv[0]);
+        goto fini;
+    }
 
     if (dump_opt['X'] || dump_opt['F'])
         rewind = ZPOOL_DO_REWIND |
@@ -9182,7 +9207,8 @@ main(int argc, char **argv)
         } else if (objset_str && !zdb_numeric(objset_str + 1) &&
             dump_opt['N']) {
             printf("Supply a numeric objset ID with -N\n");
-            exit(1);
+            error = 1;
+            goto fini;
         }
     } else {
         target_pool = target;
@@ -9239,7 +9265,8 @@ main(int argc, char **argv)
         if (argc != 2)
             usage();
         dump_opt['v'] = verbose + 3;
-        return (dump_path(argv[0], argv[1], NULL));
+        error = dump_path(argv[0], argv[1], NULL);
+        goto fini;
     }
 
     if (dump_opt['r']) {
@@ -9327,7 +9354,7 @@ main(int argc, char **argv)
             fatal("can't dump '%s': %s", target, strerror(error));
         }
-        return (error);
+        goto fini;
     } else {
         target_pool = strdup(target);
         if (strpbrk(target, "/@") != NULL)
@@ -9457,9 +9484,10 @@ main(int argc, char **argv)
         free(checkpoint_target);
     }
 
+fini:
     if (os != NULL) {
         close_objset(os, FTAG);
-    } else {
+    } else if (spa != NULL) {
         spa_close(spa, FTAG);
     }
 
@@ -9467,7 +9495,8 @@ main(int argc, char **argv)
 
     dump_debug_buffer();
 
-    kernel_fini();
+    if (kernel_init_done)
+        kernel_fini();
 
     return (error);
 }
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 63d95ddedc3b..e3caaeb70e14 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -173,8 +173,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
     if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
         (void) printf("%shas blkptr, %s\n", tab_prefix,
-            !BP_IS_HOLE(bp) &&
-            bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+            !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+            spa_min_claim_txg(zilog->zl_spa) ?
             "will claim" : "won't claim");
         print_log_bp(bp, tab_prefix);
 
@@ -186,7 +186,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
         (void) printf("%s\n", tab_prefix);
         return;
     }
-    if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+    if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
         (void) printf("%s\n", tab_prefix);
         return;
 
@@ -237,8 +237,8 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
     if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
         (void) printf("%shas blkptr, %s\n", tab_prefix,
-            !BP_IS_HOLE(bp) &&
-            bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+            !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+            spa_min_claim_txg(zilog->zl_spa) ?
             "will claim" : "won't claim");
         print_log_bp(bp, tab_prefix);
     }
@@ -473,7 +473,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 
     if (claim_txg != 0)
         claim = "already claimed";
-    else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
+    else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
         claim = "will claim";
     else
         claim = "won't claim";
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index c2147c8f4acd..0bbdd5b18eda 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -309,7 +309,8 @@ get_usage(zfs_help_t idx)
             "[filesystem|volume|snapshot] ...\n"));
     case HELP_MOUNT:
         return (gettext("\tmount\n"
-            "\tmount [-flvO] [-o opts] <-a | filesystem>\n"));
+            "\tmount [-flvO] [-o opts] <-a|-R filesystem|"
+            "filesystem>\n"));
     case HELP_PROMOTE:
         return (gettext("\tpromote <clone-filesystem>\n"));
     case HELP_RECEIVE:
@@ -2145,15 +2146,25 @@ zfs_do_get(int argc, char **argv)
             for (char *tok; (tok = strsep(&optarg, ",")); ) {
                 static const char *const type_opts[] = {
-                    "filesystem", "volume",
-                    "snapshot", "snap",
+                    "filesystem",
+                    "fs",
+                    "volume",
+                    "vol",
+                    "snapshot",
+                    "snap",
                     "bookmark",
-                    "all" };
+                    "all"
+                };
                 static const int type_types[] = {
-                    ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
-                    ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
+                    ZFS_TYPE_FILESYSTEM,
+                    ZFS_TYPE_FILESYSTEM,
+                    ZFS_TYPE_VOLUME,
+                    ZFS_TYPE_VOLUME,
+                    ZFS_TYPE_SNAPSHOT,
+                    ZFS_TYPE_SNAPSHOT,
                     ZFS_TYPE_BOOKMARK,
-                    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
+                    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
+                };
 
                 for (i = 0; i < ARRAY_SIZE(type_opts); ++i)
                     if (strcmp(tok, type_opts[i]) == 0) {
@@ -6754,6 +6765,8 @@ zfs_do_holds(int argc, char **argv)
 #define MOUNT_TIME 1           /* seconds */
 
 typedef struct get_all_state {
+    char **ga_datasets;
+    int ga_count;
     boolean_t ga_verbose;
     get_all_cb_t *ga_cbp;
 } get_all_state_t;
@@ -6800,19 +6813,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
     return (0);
 }
 
-static void
-get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
+static int
+get_recursive_datasets(zfs_handle_t *zhp, void *data)
 {
-    get_all_state_t state = {
-        .ga_verbose = verbose,
-        .ga_cbp = cbp
-    };
+    get_all_state_t *state = data;
+    int len = strlen(zfs_get_name(zhp));
+    for (int i = 0; i < state->ga_count; ++i) {
+        if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0)
+            return (get_one_dataset(zhp, data));
+        else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp),
+            len) == 0) && state->ga_datasets[i][len] == '/') {
+            (void) zfs_iter_filesystems_v2(zhp, 0,
+                get_recursive_datasets, data);
+        }
+    }
+    zfs_close(zhp);
+    return (0);
+}
 
-    if (verbose)
+static void
+get_all_datasets(get_all_state_t *state)
+{
+    if (state->ga_verbose)
         set_progress_header(gettext("Reading ZFS config"));
-    (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
+    if (state->ga_datasets == NULL)
+        (void) zfs_iter_root(g_zfs, get_one_dataset, state);
+    else
+        (void) zfs_iter_root(g_zfs, get_recursive_datasets, state);
 
-    if (verbose)
+    if (state->ga_verbose)
         finish_progress(gettext("done."));
 }
 
@@ -7158,18 +7187,22 @@ static int
 share_mount(int op, int argc, char **argv)
 {
     int do_all = 0;
+    int recursive = 0;
     boolean_t verbose = B_FALSE;
     int c, ret = 0;
     char *options = NULL;
     int flags = 0;
 
     /* check options */
-    while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al"))
+    while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al"))
         != -1) {
         switch (c) {
         case 'a':
             do_all = 1;
             break;
+        case 'R':
+            recursive = 1;
+            break;
         case 'v':
             verbose = B_TRUE;
             break;
@@ -7211,7 +7244,7 @@ share_mount(int op, int argc, char **argv)
     argv += optind;
 
     /* check number of arguments */
-    if (do_all) {
+    if (do_all || recursive) {
         enum sa_protocol protocol = SA_NO_PROTOCOL;
 
         if (op == OP_SHARE && argc > 0) {
@@ -7220,14 +7253,38 @@ share_mount(int op, int argc, char **argv)
             argv++;
         }
 
-        if (argc != 0) {
+        if (argc != 0 && do_all) {
             (void) fprintf(stderr, gettext("too many arguments\n"));
             usage(B_FALSE);
         }
 
+        if (argc == 0 && recursive) {
+            (void) fprintf(stderr,
+                gettext("no dataset provided\n"));
+            usage(B_FALSE);
+        }
+
         start_progress_timer();
         get_all_cb_t cb = { 0 };
-        get_all_datasets(&cb, verbose);
+        get_all_state_t state = { 0 };
+        if (argc == 0) {
+            state.ga_datasets = NULL;
+            state.ga_count = -1;
+        } else {
+            zfs_handle_t *zhp;
+            for (int i = 0; i < argc; i++) {
+                zhp = zfs_open(g_zfs, argv[i],
+                    ZFS_TYPE_FILESYSTEM);
+                if (zhp == NULL)
+                    usage(B_FALSE);
+                zfs_close(zhp);
+            }
+            state.ga_datasets = argv;
+            state.ga_count = argc;
+        }
+        state.ga_verbose = verbose;
+        state.ga_cbp = &cb;
+        get_all_datasets(&state);
 
         if (cb.cb_used == 0) {
             free(options);
diff --git a/cmd/zhack.c b/cmd/zhack.c
index 44611887dd25..f15a6ece538c 100644
--- a/cmd/zhack.c
+++ b/cmd/zhack.c
@@ -612,8 +612,8 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
      * Uberblock root block pointer has valid birth TXG.
      * Copying it to the label NVlist
      */
-    if (ub->ub_rootbp.blk_birth != 0) {
-        const uint64_t txg = ub->ub_rootbp.blk_birth;
+    if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
+        const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
         ub->ub_txg = txg;
 
         if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index a11b6d0b7fac..ed60cce3dd16 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2023-2024, Klara Inc.
  */
 
 /*
@@ -208,6 +209,38 @@ type_to_name(uint64_t type)
     }
 }
 
+struct errstr {
+    int err;
+    const char *str;
+};
+static const struct errstr errstrtable[] = {
+    { EIO, "io" },
+    { ECKSUM, "checksum" },
+    { EINVAL, "decompress" },
+    { EACCES, "decrypt" },
+    { ENXIO, "nxio" },
+    { ECHILD, "dtl" },
+    { EILSEQ, "corrupt" },
+    { ENOSYS, "noop" },
+    { 0, NULL },
+};
+
+static int
+str_to_err(const char *str)
+{
+    for (int i = 0; errstrtable[i].str != NULL; i++)
+        if (strcasecmp(errstrtable[i].str, str) == 0)
+            return (errstrtable[i].err);
+    return (-1);
+}
+static const char *
+err_to_str(int err)
+{
+    for (int i = 0; errstrtable[i].str != NULL; i++)
+        if (errstrtable[i].err == err)
+            return (errstrtable[i].str);
+    return ("[unknown]");
+}
 
 /*
  * Print usage message.
@@ -233,12 +266,12 @@ usage(void)
         "\t\tspa_vdev_exit() will trigger a panic.\n"
         "\n"
         "\tzinject -d device [-e errno] [-L <nvpair|uber|pad1|pad2>] [-F]\n"
-        "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
+        "\t\t[-T <read|write|free|claim|flush|all>] [-f frequency] pool\n\n"
         "\t\tInject a fault into a particular device or the device's\n"
         "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
        "\t\t'pad1', or 'pad2'.\n"
-        "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
-        "\t\t'corrupt' (bit flip).\n"
+        "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"
+        "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"
         "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
         "\t\tdevice error injection to a percentage of the IOs.\n"
         "\n"
@@ -277,6 +310,11 @@ usage(void)
         "\t\tcreate 3 lanes on the device; one lane with a latency\n"
         "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
         "\n"
+        "\tzinject -P import|export -s <seconds> pool\n"
+        "\t\tAdd an artificial delay to a future pool import or export,\n"
+        "\t\tsuch that the operation takes a minimum of supplied seconds\n"
+        "\t\tto complete.\n"
+        "\n"
         "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
         "\t\tCause the pool to stop writing blocks yet not\n"
         "\t\treport errors for a duration.  Simulates buggy hardware\n"
@@ -359,8 +397,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
 {
     int *count = data;
 
-    if (record->zi_guid != 0 || record->zi_func[0] != '\0')
+    if (record->zi_guid != 0 || record->zi_func[0] != '\0' ||
+        record->zi_duration != 0) {
         return (0);
+    }
 
     if (*count == 0) {
         (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
@@ -392,6 +432,10 @@ static int
 print_device_handler(int id, const char *pool, zinject_record_t *record,
     void *data)
 {
+    static const char *iotypestr[] = {
+        "null", "read", "write", "free", "claim", "flush", "trim", "all",
+    };
+
     int *count = data;
 
     if (record->zi_guid == 0 || record->zi_func[0] != '\0')
@@ -401,14 +445,21 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
         return (0);
 
     if (*count == 0) {
-        (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
-        (void) printf("--- --------------- ----------------\n");
+        (void) printf("%3s %-15s %-16s %-5s %-10s %-9s\n",
+            "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ");
+        (void) printf(
+            "--- --------------- ---------------- "
+            "----- ---------- ---------\n");
     }
 
     *count += 1;
 
-    (void) printf("%3d %-15s %llx\n", id, pool,
-        (u_longlong_t)record->zi_guid);
+    double freq = record->zi_freq == 0 ? 100.0f :
+        (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f;
+
+    (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%%\n", id, pool,
+        (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype],
+        err_to_str(record->zi_error), freq);
 
     return (0);
 }
@@ -463,6 +514,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record,
     return (0);
 }
 
+static int
+print_pool_delay_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+    int *count = data;
+
+    if (record->zi_cmd != ZINJECT_DELAY_IMPORT &&
+        record->zi_cmd != ZINJECT_DELAY_EXPORT) {
+        return (0);
+    }
+
+    if (*count == 0) {
+        (void) printf("%3s %-19s %-11s %s\n",
+            "ID", "POOL", "DELAY (sec)", "COMMAND");
+        (void) printf("--- ------------------- -----------"
+            " -------\n");
+    }
+
+    *count += 1;
+
+    (void) printf("%3d %-19s %-11llu %s\n",
+        id, pool, (u_longlong_t)record->zi_duration,
+        record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export");
+
+    return (0);
+}
+
 /*
  * Print all registered error handlers.  Returns the number of handlers
  * registered.
@@ -493,6 +571,13 @@ print_all_handlers(void)
         count = 0;
     }
 
+    (void) iter_handlers(print_pool_delay_handler, &count);
+    if (count > 0) {
+        total += count;
+        (void) printf("\n");
+        count = 0;
+    }
+
     (void) iter_handlers(print_panic_handler, &count);
 
     return (count + total);
@@ -565,9 +650,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
     zc.zc_guid = flags;
 
     if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
-        (void) fprintf(stderr, "failed to add handler: %s\n",
-            errno == EDOM ? "block level exceeds max level of object" :
-            strerror(errno));
+        const char *errmsg = strerror(errno);
+
+        switch (errno) {
+        case EDOM:
+            errmsg = "block level exceeds max level of object";
+            break;
+        case EEXIST:
+            if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
+                errmsg = "pool already imported";
+            if (record->zi_cmd == ZINJECT_DELAY_EXPORT)
+                errmsg = "a handler already exists";
+            break;
+        case ENOENT:
+            /* import delay injector running on older zfs module */
+            if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
+                errmsg = "import delay injector not supported";
+            break;
+        default:
+            break;
+        }
+        (void) fprintf(stderr, "failed to add handler: %s\n", errmsg);
         return (1);
     }
 
@@ -592,6 +695,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
     } else if (record->zi_duration < 0) {
         (void) printf(" txgs: %lld \n",
             (u_longlong_t)-record->zi_duration);
+    } else if (record->zi_timer > 0) {
+        (void) printf(" timer: %lld ms\n",
+            (u_longlong_t)NSEC2MSEC(record->zi_timer));
     } else {
         (void) printf("objset: %llu\n",
             (u_longlong_t)record->zi_objset);
@@ -790,7 +896,7 @@ main(int argc, char **argv)
     }
 
     while ((c = getopt(argc, argv,
-        ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+        ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
         switch (c) {
         case 'a':
             flags |= ZINJECT_FLUSH_ARC;
             break;
@@ -842,24 +948,12 @@ main(int argc, char **argv)
             }
             break;
         case 'e':
-            if (strcasecmp(optarg, "io") == 0) {
-                error = EIO;
-            } else if (strcasecmp(optarg, "checksum") == 0) {
-                error = ECKSUM;
-            } else if (strcasecmp(optarg, "decompress") == 0) {
-                error = EINVAL;
-            } else if (strcasecmp(optarg, "decrypt") == 0) {
-                error = EACCES;
-            } else if (strcasecmp(optarg, "nxio") == 0) {
-                error = ENXIO;
-            } else if (strcasecmp(optarg, "dtl") == 0) {
-                error = ECHILD;
-            } else if (strcasecmp(optarg, "corrupt") == 0) {
-                error = EILSEQ;
-            } else {
+            error = str_to_err(optarg);
+            if (error < 0) {
                 (void) fprintf(stderr, "invalid error type "
-                    "'%s': must be 'io', 'checksum' or "
-                    "'nxio'\n", optarg);
+                    "'%s': must be one of: io decompress "
+                    "decrypt nxio dtl corrupt noop\n",
+                    optarg);
                 usage();
                 libzfs_fini(g_zfs);
                 return (1);
@@ -920,6 +1014,19 @@ main(int argc, char **argv)
                 sizeof (record.zi_func));
             record.zi_cmd = ZINJECT_PANIC;
             break;
+        case 'P':
+            if (strcasecmp(optarg, "import") == 0) {
+                record.zi_cmd = ZINJECT_DELAY_IMPORT;
+            } else if (strcasecmp(optarg, "export") == 0) {
+                record.zi_cmd = ZINJECT_DELAY_EXPORT;
+            } else {
+                (void) fprintf(stderr, "invalid command '%s': "
+                    "must be 'import' or 'export'\n", optarg);
+                usage();
+                libzfs_fini(g_zfs);
+                return (1);
+            }
+            break;
         case 'q':
             quiet = 1;
             break;
@@ -947,12 +1054,14 @@ main(int argc, char **argv)
                 io_type = ZIO_TYPE_FREE;
             } else if (strcasecmp(optarg, "claim") == 0) {
                 io_type = ZIO_TYPE_CLAIM;
+            } else if (strcasecmp(optarg, "flush") == 0) {
+                io_type = ZIO_TYPE_FLUSH;
             } else if (strcasecmp(optarg, "all") == 0) {
                 io_type = ZIO_TYPES;
             } else {
                 (void) fprintf(stderr, "invalid I/O type "
                     "'%s': must be 'read', 'write', 'free', "
-                    "'claim' or 'all'\n", optarg);
+                    "'claim', 'flush' or 'all'\n", optarg);
                 usage();
                 libzfs_fini(g_zfs);
                 return (1);
@@ -999,7 +1108,7 @@ main(int argc, char **argv)
     argc -= optind;
     argv += optind;
 
-    if (record.zi_duration != 0)
+    if (record.zi_duration != 0 && record.zi_cmd == 0)
         record.zi_cmd = ZINJECT_IGNORED_WRITES;
 
     if (cancel != NULL) {
@@ -1145,8 +1254,8 @@ main(int argc, char **argv)
         if (raw != NULL || range != NULL || type != TYPE_INVAL ||
             level != 0 || device != NULL || record.zi_freq > 0 ||
             dvas != 0) {
-            (void) fprintf(stderr, "panic (-p) incompatible with "
-                "other options\n");
+            (void) fprintf(stderr, "%s incompatible with other "
+                "options\n", "import|export delay (-P)");
             usage();
             libzfs_fini(g_zfs);
             return (2);
@@ -1164,6 +1273,28 @@ main(int argc, char **argv)
         if (argv[1] != NULL)
             record.zi_type = atoi(argv[1]);
         dataset[0] = '\0';
+    } else if (record.zi_cmd == ZINJECT_DELAY_IMPORT ||
+        record.zi_cmd == ZINJECT_DELAY_EXPORT) {
+        if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+            level != 0 || device != NULL || record.zi_freq > 0 ||
+            dvas != 0) {
+            (void) fprintf(stderr, "%s incompatible with other "
+                "options\n", "import|export delay (-P)");
+            usage();
+            libzfs_fini(g_zfs);
+            return (2);
+        }
+
+        if (argc != 1 || record.zi_duration <= 0) {
+            (void) fprintf(stderr, "import|export delay (-P) "
+                "injection requires a duration (-s) and a single "
+                "pool name\n");
+            usage();
+            libzfs_fini(g_zfs);
+            return (2);
+        }
+
+        (void) strlcpy(pool, argv[0], sizeof (pool));
     } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
         if (raw != NULL || range != NULL || type != TYPE_INVAL ||
             level != 0 || record.zi_freq > 0 || dvas != 0) {
diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c
index 006a3a7d8e01..f194d28c55a9 100644
--- a/cmd/zpool/os/linux/zpool_vdev_os.c
+++ b/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path)
         return (NULL);
     }
 
-    buf = calloc(sizeof (*buf), statbuf.st_size + 1);
+    buf = calloc(statbuf.st_size + 1, sizeof (*buf));
     if (buf == NULL) {
         close(fd);
         return (NULL);
@@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path)
     }
 
     /* Remove trailing newline */
-    if (buf[count - 1] == '\n')
+    if (count > 0 && buf[count - 1] == '\n')
         buf[count - 1] = 0;
 
     close(fd);
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 0783271f4734..300b383af4f6 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
  * Copyright (c) 2012 by Cyril Plisko. All rights reserved.
  * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include <thread_pool.h>
 #include
 #include
 #include
@@ -131,6 +132,13 @@ static int zpool_do_help(int argc, char **argv);
 static zpool_compat_status_t zpool_do_load_compat(
     const char *, boolean_t *);
 
+enum zpool_options {
+    ZPOOL_OPTION_POWER = 1024,
+    ZPOOL_OPTION_ALLOW_INUSE,
+    ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH,
+    ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH
+};
+
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
@@ -347,7 +355,7 @@ get_usage(zpool_help_t idx)
 {
     switch (idx) {
     case HELP_ADD:
-        return (gettext("\tadd [-fgLnP] [-o property=value] "
+        return (gettext("\tadd [-afgLnP] [-o property=value] "
            "<pool> <vdev> ...\n"));
     case HELP_ATTACH:
         return (gettext("\tattach [-fsw] [-o property=value] "
@@ -413,7 +421,7 @@ get_usage(zpool_help_t idx)
            "[<device> ...]\n"));
     case HELP_STATUS:
         return (gettext("\tstatus [--power] [-c [script1,script2,...]] "
-            "[-igLpPstvxD] [-T d|u] [pool] ... \n"
+            "[-DegiLpPstvx] [-T d|u] [pool] ...\n"
            "\t    [interval [count]]\n"));
     case HELP_UPGRADE:
         return (gettext("\tupgrade\n"
@@ -1009,8 +1017,9 @@ add_prop_list_default(const char *propname, const char *propval,
 }
 
 /*
- * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
+ * zpool add [-afgLnP] [-o property=value] <pool> <vdev> ...
  *
+ *	-a	Disable the ashift validation checks
  *	-f	Force addition of devices, even if they appear in use
  *	-g	Display guid for individual vdev name.
  *	-L	Follow links when resolving vdev path name.
@@ -1026,8 +1035,11 @@ add_prop_list_default(const char *propname, const char *propval,
 int
 zpool_do_add(int argc, char **argv)
 {
-    boolean_t force = B_FALSE;
+    boolean_t check_replication = B_TRUE;
+    boolean_t check_inuse = B_TRUE;
     boolean_t dryrun = B_FALSE;
+    boolean_t check_ashift = B_TRUE;
+    boolean_t force = B_FALSE;
     int name_flags = 0;
     int c;
     nvlist_t *nvroot;
@@ -1038,8 +1050,18 @@ zpool_do_add(int argc, char **argv)
     nvlist_t *props = NULL;
     char *propval;
 
+    struct option long_options[] = {
+        {"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE},
+        {"allow-replication-mismatch", no_argument, NULL,
+            ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH},
+        {"allow-ashift-mismatch", no_argument, NULL,
+            ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH},
+        {0, 0, 0, 0}
+    };
+
     /* check options */
-    while ((c = getopt(argc, argv, "fgLno:P")) != -1) {
+    while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL))
+        != -1) {
         switch (c) {
         case 'f':
             force = B_TRUE;
@@ -1069,6 +1091,15 @@ zpool_do_add(int argc, char **argv)
         case 'P':
             name_flags |= VDEV_NAME_PATH;
             break;
+        case ZPOOL_OPTION_ALLOW_INUSE:
+            check_inuse = B_FALSE;
+            break;
+        case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH:
+            check_replication = B_FALSE;
+            break;
+        case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH:
+            check_ashift = B_FALSE;
+            break;
         case '?':
             (void) fprintf(stderr, gettext("invalid option '%c'\n"),
                 optopt);
@@ -1089,6 +1120,19 @@ zpool_do_add(int argc, char **argv)
         usage(B_FALSE);
     }
 
+    if (force) {
+        if (!check_inuse || !check_replication || !check_ashift) {
+            (void) fprintf(stderr, gettext("'-f' option is not "
+                "allowed with '--allow-replication-mismatch', "
+                "'--allow-ashift-mismatch', or "
+                "'--allow-in-use'\n"));
+            usage(B_FALSE);
+        }
+        check_inuse = B_FALSE;
+        check_replication = B_FALSE;
+        check_ashift = B_FALSE;
+    }
+
     poolname = argv[0];
 
     argc--;
@@ -1119,8 +1163,8 @@ zpool_do_add(int argc, char **argv)
     }
 
     /* pass off to make_root_vdev for processing */
-    nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
-        argc, argv);
+    nvroot = make_root_vdev(zhp, props, !check_inuse,
+        check_replication, B_FALSE, dryrun, argc, argv);
     if (nvroot == NULL) {
         zpool_close(zhp);
         return (1);
@@ -1224,7 +1268,7 @@ zpool_do_add(int argc, char **argv)
 
         ret = 0;
     } else {
-        ret = (zpool_add(zhp, nvroot) != 0);
+        ret = (zpool_add(zhp, nvroot, check_ashift) != 0);
     }
 
     nvlist_free(props);
@@ -2246,7 +2290,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
         !vs->vs_scan_removing) {
         char zbuf[1024];
         char tbuf[256];
-        struct tm zaction_ts;
 
         time_t t = vs->vs_initialize_action_time;
         int initialize_pct = 100;
@@ -2256,8 +2299,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
                 100 / (vs->vs_initialize_bytes_est + 1));
         }
 
-        (void) localtime_r(&t, &zaction_ts);
-        (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+        (void) ctime_r(&t, tbuf);
+        tbuf[24] = 0;
 
         switch (vs->vs_initialize_state) {
         case VDEV_INITIALIZE_SUSPENDED:
@@ -2297,7 +2340,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
         !vs->vs_scan_removing) {
         char zbuf[1024];
         char tbuf[256];
-        struct tm zaction_ts;
 
         time_t t = vs->vs_trim_action_time;
         int trim_pct = 100;
@@ -2306,8 +2348,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
                 100 / (vs->vs_trim_bytes_est + 1));
         }
 
-        (void) localtime_r(&t, &zaction_ts);
-        (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+        (void) ctime_r(&t, tbuf);
+        tbuf[24] = 0;
 
         switch (vs->vs_trim_state) {
         case VDEV_TRIM_SUSPENDED:
@@ -3403,10 +3445,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
         ms_status = zpool_enable_datasets(zhp, mntopts, 0);
         if (ms_status == EZFS_SHAREFAILED) {
             (void) fprintf(stderr, gettext("Import was "
-                "successful, but unable to share some datasets"));
+                "successful, but unable to share some datasets\n"));
         } else if (ms_status == EZFS_MOUNTFAILED) {
             (void) fprintf(stderr, gettext("Import was "
-                "successful, but unable to mount some datasets"));
+                "successful, but unable to mount some datasets\n"));
         }
     }
 
@@ -3414,15 +3456,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
     return (ret);
 }
 
+typedef struct import_parameters {
+    nvlist_t *ip_config;
+    const char *ip_mntopts;
+    nvlist_t *ip_props;
+    int ip_flags;
+    int *ip_err;
+} import_parameters_t;
+
+static void
+do_import_task(void *arg)
+{
+    import_parameters_t *ip = arg;
+    *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts,
+        ip->ip_props, ip->ip_flags);
+    free(ip);
+}
+
+
 static int
 import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
-    char *orig_name, char *new_name,
-    boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all,
-    importargs_t *import)
+    char *orig_name, char *new_name, importargs_t *import)
 {
     nvlist_t *config = NULL;
     nvlist_t *found_config = NULL;
     uint64_t pool_state;
+    boolean_t pool_specified = (import->poolname != NULL ||
+        import->guid != 0);
+
+
+    tpool_t *tp = NULL;
+    if (import->do_all) {
+        tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
+            0, NULL);
+    }
 
     /*
      * At this point we have a list of import candidate configs. Even if
@@ -3439,9 +3506,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
         verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
             &pool_state) == 0);
-        if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+        if (!import->do_destroyed &&
+            pool_state == POOL_STATE_DESTROYED)
             continue;
-        if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+        if (import->do_destroyed &&
+            pool_state != POOL_STATE_DESTROYED)
             continue;
 
         verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
@@ -3450,12 +3519,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
         if (!pool_specified) {
             if (first)
                 first = B_FALSE;
-            else if (!do_all)
+            else if (!import->do_all)
                 (void) fputc('\n', stdout);
 
-            if (do_all) {
-                err |= do_import(config, NULL, mntopts,
-                    props, flags);
+            if (import->do_all) {
+                import_parameters_t *ip = safe_malloc(
+                    sizeof (import_parameters_t));
+
+                ip->ip_config = config;
+                ip->ip_mntopts = mntopts;
+                ip->ip_props = props;
+                ip->ip_flags = flags;
+                ip->ip_err = &err;
+
+                (void) tpool_dispatch(tp, do_import_task,
+                    (void *)ip);
             } else {
                 /*
                  * If we're importing from cachefile, then
@@ -3503,6 +3581,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
             found_config = config;
         }
     }
+    if (import->do_all) {
+        tpool_wait(tp);
+        tpool_destroy(tp);
+    }
 
     /*
      * If we were searching for a specific pool, verify that we found a
@@ -3732,7 +3814,6 @@ zpool_do_import(int argc, char **argv)
     boolean_t xtreme_rewind = B_FALSE;
     boolean_t do_scan = B_FALSE;
     boolean_t pool_exists = B_FALSE;
-    boolean_t pool_specified = B_FALSE;
     uint64_t txg = -1ULL;
     char *cachefile = NULL;
     importargs_t idata = { 0 };
@@ -3931,7 +4012,6 @@ zpool_do_import(int argc, char **argv)
         searchname = argv[0];
         searchguid = 0;
     }
-    pool_specified = B_TRUE;
 
     /*
      * User specified a name or guid. Ensure it's unique.
@@ -3964,6 +4044,8 @@ zpool_do_import(int argc, char **argv)
     idata.cachefile = cachefile;
     idata.scan = do_scan;
     idata.policy = policy;
+    idata.do_destroyed = do_destroyed;
+    idata.do_all = do_all;
 
     libpc_handle_t lpch = {
         .lpc_lib_handle = g_zfs,
@@ -4006,9 +4088,7 @@ zpool_do_import(int argc, char **argv)
     }
 
     err = import_pools(pools, props, mntopts, flags,
-        argc >= 1 ? argv[0] : NULL,
-        argc >= 2 ? argv[1] : NULL,
-        do_destroyed, pool_specified, do_all, &idata);
+        argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata);
 
     /*
      * If we're using the cachefile and we failed to import, then
@@ -4029,9 +4109,8 @@ zpool_do_import(int argc, char **argv)
         pools = zpool_search_import(&lpch, &idata);
 
         err = import_pools(pools, props, mntopts, flags,
-            argc >= 1 ? argv[0] : NULL,
-            argc >= 2 ? argv[1] : NULL,
-            do_destroyed, pool_specified, do_all, &idata);
+            argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL,
+            &idata);
     }
 
 error:
@@ -7081,7 +7160,6 @@ zpool_do_split(int argc, char **argv)
     return (ret);
 }
 
-#define POWER_OPT 1024
 
 /*
  * zpool online [--power] <pool> <device> ...
@@ -7099,7 +7177,7 @@ zpool_do_online(int argc, char **argv)
     int flags = 0;
     boolean_t is_power_on = B_FALSE;
     struct option long_options[] = {
-        {"power", no_argument, NULL, POWER_OPT},
+        {"power", no_argument, NULL, ZPOOL_OPTION_POWER},
         {0, 0, 0, 0}
     };
 
@@ -7109,7 +7187,7 @@ zpool_do_online(int argc, char **argv)
         case 'e':
             flags |= ZFS_ONLINE_EXPAND;
             break;
-        case POWER_OPT:
+        case ZPOOL_OPTION_POWER:
             is_power_on = B_TRUE;
             break;
         case '?':
@@ -7222,7 +7300,7 @@ zpool_do_offline(int argc, char **argv)
     boolean_t is_power_off = B_FALSE;
 
     struct option long_options[] = {
-        {"power", no_argument, NULL, POWER_OPT},
+        {"power", no_argument, NULL, ZPOOL_OPTION_POWER},
        {0, 0, 0, 0}
     };
 
@@ -7235,7 +7313,7 @@ zpool_do_offline(int argc, char **argv)
         case 't':
             istmp = B_TRUE;
             break;
-        case POWER_OPT:
+        case ZPOOL_OPTION_POWER:
             is_power_off = B_TRUE;
             break;
         case '?':
@@ -7335,7 +7413,7 @@ zpool_do_clear(int argc, char **argv)
     char *pool, *device;
 
     struct option long_options[] = {
-        {"power", no_argument, NULL, POWER_OPT},
+        {"power", no_argument, NULL, ZPOOL_OPTION_POWER},
         {0, 0, 0, 0}
     };
 
@@ -7352,7 +7430,7 @@ zpool_do_clear(int argc, char **argv)
         case 'X':
             xtreme_rewind = B_TRUE;
             break;
-        case POWER_OPT:
+        case ZPOOL_OPTION_POWER:
             is_power_on = B_TRUE;
             break;
         case '?':
@@ -8972,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
         printf_color(ANSI_BOLD, gettext("action: "));
         printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
             " are connected, then reboot your system and\n\timport the "
-            "pool.\n"));
+            "pool or run 'zpool clear' to resume the pool.\n"));
         break;
 
     case ZPOOL_STATUS_IO_FAILURE_WAIT:
@@ -9177,22 +9255,22 @@ status_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ...
+ * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ...
  *              [pool] [interval [count]]
 *
 *	-c CMD	For each vdev, run command CMD
+ *	-D	Display dedup status (undocumented)
 *	-e	Display only unhealthy vdevs
- *	-i	Display vdev initialization status.
 *	-g	Display guid for individual vdev name.
+ *	-i	Display vdev initialization status.
 *	-L	Follow links when resolving vdev path name.
 *	-p	Display values in parsable (exact) format.
 *	-P	Display full path for vdev name.
 *	-s	Display slow IOs column.
- *	-v	Display complete error logs
- *	-x	Display only pools with potential problems
- *	-D	Display dedup status (undocumented)
 *	-t	Display vdev TRIM status.
 *	-T	Display a timestamp in date(1) or Unix format
+ *	-v	Display complete error logs
+ *	-x	Display only pools with potential problems
 *	--power	Display vdev enclosure slot power status
 *
 * Describes the health status of all pools or some subset.
@@ -9208,12 +9286,12 @@ zpool_do_status(int argc, char **argv)
     char *cmd = NULL;
 
     struct option long_options[] = {
-        {"power", no_argument, NULL, POWER_OPT},
+        {"power", no_argument, NULL, ZPOOL_OPTION_POWER},
         {0, 0, 0, 0}
     };
 
     /* check options */
-    while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
+    while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options,
         NULL)) != -1) {
         switch (c) {
         case 'c':
@@ -9240,15 +9318,18 @@ zpool_do_status(int argc, char **argv)
             }
             cmd = optarg;
             break;
+        case 'D':
+            cb.cb_dedup_stats = B_TRUE;
+            break;
         case 'e':
             cb.cb_print_unhealthy = B_TRUE;
             break;
-        case 'i':
-            cb.cb_print_vdev_init = B_TRUE;
-            break;
         case 'g':
             cb.cb_name_flags |= VDEV_NAME_GUID;
             break;
+        case 'i':
+            cb.cb_print_vdev_init = B_TRUE;
+            break;
         case 'L':
             cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
             break;
@@ -9261,22 +9342,19 @@ zpool_do_status(int argc, char **argv)
         case 's':
             cb.cb_print_slow_ios = B_TRUE;
             break;
-        case 'v':
-            cb.cb_verbose = B_TRUE;
-            break;
-        case 'x':
-            cb.cb_explain = B_TRUE;
-            break;
-        case 'D':
-            cb.cb_dedup_stats = B_TRUE;
-            break;
         case 't':
             cb.cb_print_vdev_trim = B_TRUE;
             break;
         case 'T':
             get_timestamp_arg(*optarg);
             break;
-        case POWER_OPT:
+        case 'v':
+            cb.cb_verbose = B_TRUE;
+            break;
+        case 'x':
+            cb.cb_explain = B_TRUE;
+            break;
+        case ZPOOL_OPTION_POWER:
             cb.cb_print_power = B_TRUE;
             break;
         case '?':
@@ -9315,7 +9393,6 @@ zpool_do_status(int argc, char **argv)
     if (cb.vcdl != NULL)
         free_vdev_cmd_data_list(cb.vcdl);
-
     if (argc == 0 && cb.cb_count == 0)
         (void) fprintf(stderr, gettext("no pools available\n"));
     else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
@@ -10752,11 +10829,10 @@ zpool_do_get(int argc, char **argv)
         }
     } else {
         /*
-         * The first arg isn't a pool name,
+         * The first arg isn't the name of a valid pool.
          */
-        fprintf(stderr, gettext("missing pool name.\n"));
-        fprintf(stderr, "\n");
-        usage(B_FALSE);
+        fprintf(stderr, gettext("Cannot get properties of %s: "
+            "no such pool available.\n"), argv[0]);
         return (1);
     }
 
diff --git a/cmd/ztest.c b/cmd/ztest.c
index a07bb9eca242..56eb01618c2e 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -20,7 +20,7 @@
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
@@ -3375,7 +3375,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
         "log" : NULL, raidz_children, zs->zs_mirrors, 1);
 
-    error = spa_vdev_add(spa, nvroot);
+    error = spa_vdev_add(spa, nvroot, B_FALSE);
     fnvlist_free(nvroot);
 
     switch (error) {
@@ -3438,7 +3438,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
     nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
         class, raidz_children, zs->zs_mirrors, 1);
 
-    error = spa_vdev_add(spa, nvroot);
+    error = spa_vdev_add(spa, nvroot, B_FALSE);
     fnvlist_free(nvroot);
 
     if (error == ENOSPC)
@@ -3545,7 +3545,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
         */
        nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
            (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
-        error = spa_vdev_add(spa, nvroot);
+        error = spa_vdev_add(spa, nvroot, B_FALSE);
 
         switch (error) {
         case 0:
@@ -8048,7 +8048,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa)
     ztest_expand_io_t *thread_args;
 
     ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE);
-    ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops);
+    ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops);
     ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED;
 
     /* Setup a 1 MiB buffer of random data */
diff --git a/config/Substfiles.am b/config/Substfiles.am
index 38e870b2f501..2459637abe6e 100644
--- a/config/Substfiles.am
+++ b/config/Substfiles.am
@@ -18,6 +18,7 @@ subst_sed_cmd = \
	-e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \
	-e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \
	-e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \
+	-e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \
	-e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \
	-e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \
	-e 's|@PYTHON[@]|$(PYTHON)|g' \
@@ -43,4 +44,4 @@ SUBSTFILES =
 CLEANFILES += $(SUBSTFILES)
 dist_noinst_DATA += $(SUBSTFILES:=.in)
 
-$(call SUBST,%,)
+$(SUBSTFILES): $(call SUBST,%,)
diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4
index 9b123b1b2db1..98c1cc230205 100644
--- a/config/always-pyzfs.m4
+++ b/config/always-pyzfs.m4
@@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
		[AC_MSG_ERROR("Python $PYTHON_VERSION unknown")]
	)
 
-	AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [
-		AS_IF([test "x$enable_pyzfs" = xyes], [
-			AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed")
-		], [test "x$enable_pyzfs" != xno], [
+	AS_IF([test "x$enable_pyzfs" = xyes], [
+		AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION])
+	], [
+		AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true])
+		AS_IF([test "x$ax_python_devel_found" = xno], [
			enable_pyzfs=no
		])
	])
diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4
index f6d4b01444d6..1f480db6d233 100644
--- a/config/ax_python_devel.m4
+++ b/config/ax_python_devel.m4
@@ -4,18 +4,13 @@
 #
 # SYNOPSIS
 #
-#   AX_PYTHON_DEVEL([version], [action-if-not-found])
+#   AX_PYTHON_DEVEL([version[,optional]])
 #
 # DESCRIPTION
 #
 #   Note: Defines as a precious variable "PYTHON_VERSION". Don't override it
 #   in your configure.ac.
 #
-#   Note: this is a slightly modified version of the original AX_PYTHON_DEVEL
-#   macro which accepts an additional [action-if-not-found] argument. This
-#   allow to detect if Python development is available without aborting the
-#   configure phase with an hard error in case it is not.
-#
 #   This macro checks for Python and tries to get the include path to
 #   'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output
 #   variables. It also exports $(PYTHON_EXTRA_LIBS) and
@@ -28,6 +23,11 @@
 #   version number. Don't use "PYTHON_VERSION" for this: that environment
 #   variable is declared as precious and thus reserved for the end-user.
 #
+#   By default this will fail if it does not detect a development version of
+#   python. If you want it to continue, set optional to true, like
+#   AX_PYTHON_DEVEL([], [true]). The ax_python_devel_found variable will be
+#   "no" if it fails.
+#
 #   This macro should work for all versions of Python >= 2.1.0. As an end
 #   user, you can disable the check for the python version by setting the
 #   PYTHON_NOVERSIONCHECK environment variable to something else than the
@@ -45,7 +45,6 @@
 #   Copyright (c) 2009 Matteo Settenvini
 #   Copyright (c) 2009 Horst Knorr
 #   Copyright (c) 2013 Daniel Mullner
-#   Copyright (c) 2018 loli10K
 #
 #   This program is free software: you can redistribute it and/or modify it
 #   under the terms of the GNU General Public License as published by the
@@ -73,10 +72,18 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
-#serial 21
+#serial 36
 
 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
 AC_DEFUN([AX_PYTHON_DEVEL],[
+	# Get whether it's optional
+	if test -z "$2"; then
+		ax_python_devel_optional=false
+	else
+		ax_python_devel_optional=$2
+	fi
+	ax_python_devel_found=yes
+
	#
	# Allow the use of a (user set) custom python version
	#
@@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[
 
	AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]])
	if test -z "$PYTHON"; then
-		m4_ifvaln([$2],[$2],[
-			AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path])
-			PYTHON_VERSION=""
-		])
+		AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path])
+		if ! $ax_python_devel_optional; then
+			AC_MSG_ERROR([Giving up, python development not available])
+		fi
+		ax_python_devel_found=no
+		PYTHON_VERSION=""
	fi
 
-	#
-	# Check for a version of Python >= 2.1.0
-	#
-	AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
-	ac_supports_python_ver=`$PYTHON -c "import sys; \
+	if test $ax_python_devel_found = yes; then
+		#
+		# Check for a version of Python >= 2.1.0
+		#
+		AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
+		ac_supports_python_ver=`$PYTHON -c "import sys; \
		ver = sys.version.split ()[[0]]; \
		print (ver >= '2.1.0')"`
-	if test "$ac_supports_python_ver" != "True"; then
+		if test "$ac_supports_python_ver" != "True"; then
		if test -z "$PYTHON_NOVERSIONCHECK"; then
			AC_MSG_RESULT([no])
-			AC_MSG_FAILURE([
+			AC_MSG_WARN([
 This version of the AC@&t@_PYTHON_DEVEL macro
 doesn't work properly with versions of Python before
 2.1.0. You may need to re-run configure, setting the
@@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand.
 Moreover, to disable this check, set PYTHON_NOVERSIONCHECK
 to something else than an empty string.
 ])
+			if ! $ax_python_devel_optional; then
+				AC_MSG_FAILURE([Giving up])
+			fi
+			ax_python_devel_found=no
+			PYTHON_VERSION=""
		else
			AC_MSG_RESULT([skip at user request])
		fi
-	else
+		else
		AC_MSG_RESULT([yes])
+		fi
	fi
 
-	#
-	# If the macro parameter ``version'' is set, honour it.
-	# A Python shim class, VPy, is used to implement correct version comparisons via
-	# string expressions, since e.g. a naive textual ">= 2.7.3" won't work for
-	# Python 2.7.10 (the ".1" being evaluated as less than ".3").
-	#
-	if test -n "$1"; then
+	if test $ax_python_devel_found = yes; then
+		#
+		# If the macro parameter ``version'' is set, honour it.
+ # A Python shim class, VPy, is used to implement correct version comparisons via + # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for + # Python 2.7.10 (the ".1" being evaluated as less than ".3"). + # + if test -n "$1"; then AC_MSG_CHECKING([for a version of Python $1]) cat << EOF > ax_python_devel_vpy.py class VPy: @@ -133,7 +150,7 @@ class VPy: return tuple(map(int, s.strip().replace("rc", ".").split("."))) def __init__(self): import sys - self.vpy = tuple(sys.version_info) + self.vpy = tuple(sys.version_info)[[:3]] def __eq__(self, s): return self.vpy == self.vtup(s) def __ne__(self, s): @@ -155,25 +172,69 @@ EOF AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) - AC_MSG_ERROR([this package requires Python $1. + AC_MSG_WARN([this package requires Python $1. If you have it installed, but it isn't the default Python interpreter in your system path, please pass the PYTHON_VERSION variable to configure. See ``configure --help'' for reference. ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no PYTHON_VERSION="" fi + fi fi - # - # Check for Python include path - # - # - AC_MSG_CHECKING([for Python include path]) - if test -z "$PYTHON_CPPFLAGS"; then - python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('include'));"` - plat_python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('platinclude'));"` + if test $ax_python_devel_found = yes; then + # + # Check if you have distutils, else fail + # + AC_MSG_CHECKING([for the sysconfig Python package]) + ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="import sysconfig" + else + AC_MSG_RESULT([no]) + + AC_MSG_CHECKING([for the distutils Python package]) + ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="from distutils import sysconfig" + else + AC_MSG_WARN([cannot import Python module "distutils". +Please check your Python installation. The error was: +$ac_sysconfig_result]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi + fi + fi + + if test $ax_python_devel_found = yes; then + # + # Check for Python include path + # + AC_MSG_CHECKING([for Python include path]) + if test -z "$PYTHON_CPPFLAGS"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + # sysconfig module has different functions + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('include'));"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('platinclude'));"` + else + # old distutils way + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc ());"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc (plat_specific=1));"` + fi if test -n "${python_path}"; then if test "${plat_python_path}" != "${python_path}"; then python_path="-I$python_path -I$plat_python_path" @@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference. 
fi fi PYTHON_CPPFLAGS=$python_path - fi - AC_MSG_RESULT([$PYTHON_CPPFLAGS]) - AC_SUBST([PYTHON_CPPFLAGS]) + fi + AC_MSG_RESULT([$PYTHON_CPPFLAGS]) + AC_SUBST([PYTHON_CPPFLAGS]) - # - # Check for Python library path - # - AC_MSG_CHECKING([for Python library path]) - if test -z "$PYTHON_LIBS"; then + # + # Check for Python library path + # + AC_MSG_CHECKING([for Python library path]) + if test -z "$PYTHON_LIBS"; then # (makes two attempts to ensure we've got a version number # from the interpreter) ac_python_version=`cat</dev/null || \ - $PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('purelib'));"` - fi - AC_MSG_RESULT([$PYTHON_SITE_PKG]) - AC_SUBST([PYTHON_SITE_PKG]) + if test $ax_python_devel_found = yes; then + AC_MSG_RESULT([$PYTHON_LIBS]) + AC_SUBST([PYTHON_LIBS]) - # - # libraries which must be linked in when embedding - # - AC_MSG_CHECKING(python extra libraries) - if test -z "$PYTHON_EXTRA_LIBS"; then - PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \ + # + # Check for site packages + # + AC_MSG_CHECKING([for Python site-packages path]) + if test -z "$PYTHON_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(0,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_SITE_PKG]) + AC_SUBST([PYTHON_SITE_PKG]) + + # + # Check for platform-specific site packages + # + AC_MSG_CHECKING([for Python platform specific site-packages path]) + if test -z "$PYTHON_PLATFORM_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(1,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG]) + AC_SUBST([PYTHON_PLATFORM_SITE_PKG]) + + # + # libraries which must be linked in when embedding + # + AC_MSG_CHECKING(python extra libraries) + if test -z "$PYTHON_EXTRA_LIBS"; then + PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LIBS') + ' ' + conf('SYSLIBS'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) - AC_SUBST(PYTHON_EXTRA_LIBS) + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) + AC_SUBST(PYTHON_EXTRA_LIBS) - # - # linking flags needed when embedding - # - AC_MSG_CHECKING(python extra linking flags) - if test -z "$PYTHON_EXTRA_LDFLAGS"; then - PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \ + # + # linking flags needed when embedding + # + AC_MSG_CHECKING(python extra linking 
flags) + if test -z "$PYTHON_EXTRA_LDFLAGS"; then + PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LINKFORSHARED'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) - AC_SUBST(PYTHON_EXTRA_LDFLAGS) + # Hack for macos, it sticks this in here. + PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'` + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) + AC_SUBST(PYTHON_EXTRA_LDFLAGS) - # - # final check to see if everything compiles alright - # - AC_MSG_CHECKING([consistency of all components of python development environment]) - # save current global flags - ac_save_LIBS="$LIBS" - ac_save_LDFLAGS="$LDFLAGS" - ac_save_CPPFLAGS="$CPPFLAGS" - LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS" - LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" - CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([ + # + # final check to see if everything compiles alright + # + AC_MSG_CHECKING([consistency of all components of python development environment]) + # save current global flags + ac_save_LIBS="$LIBS" + ac_save_LDFLAGS="$LDFLAGS" + ac_save_CPPFLAGS="$CPPFLAGS" + LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS" + LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" + CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" + AC_LANG_PUSH([C]) + AC_LINK_IFELSE([ AC_LANG_PROGRAM([[#include ]], [[Py_Initialize();]]) ],[pythonexists=yes],[pythonexists=no]) - AC_LANG_POP([C]) - # turn back to default flags - CPPFLAGS="$ac_save_CPPFLAGS" - LIBS="$ac_save_LIBS" - LDFLAGS="$ac_save_LDFLAGS" + AC_LANG_POP([C]) + # turn back to default flags + CPPFLAGS="$ac_save_CPPFLAGS" + LIBS="$ac_save_LIBS" + LDFLAGS="$ac_save_LDFLAGS" - AC_MSG_RESULT([$pythonexists]) + AC_MSG_RESULT([$pythonexists]) - if test ! "x$pythonexists" = "xyes"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_FAILURE([ + if test ! "x$pythonexists" = "xyes"; then + AC_MSG_WARN([ Could not link test program to Python. Maybe the main Python library has been installed in some non-standard library path. If so, pass it to configure, via the LIBS environment variable. @@ -340,9 +453,13 @@ EOD` You probably have to install the development version of the Python package for your distribution. The exact name of this package varies among them. ============================================================================ - ]) - PYTHON_VERSION="" - ]) + ]) + if ! 
$ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi fi # diff --git a/config/find_system_library.m4 b/config/find_system_library.m4 index 310b44112aea..8b98bd67d2ee 100644 --- a/config/find_system_library.m4 +++ b/config/find_system_library.m4 @@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [ AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]]) $7 ],[dnl ELSE - AC_SUBST([$1]_CFLAGS, []) - AC_SUBST([$1]_LIBS, []) + AC_SUBST([$1]_CFLAGS, [""]) + AC_SUBST([$1]_LIBS, [""]) AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations]) $8 ]) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index bb5903b313eb..15dbe1c7dff0 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ (void) blk_mq_alloc_tag_set(&tag_set); return BLK_STS_OK; ], []) + ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [ + #include <linux/blk-mq.h> + #include <linux/blkdev.h> + ], [ + struct request rq = {0}; + struct blk_mq_hw_ctx *hctx = NULL; + rq.mq_hctx = hctx; + ], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ @@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ ZFS_LINUX_TEST_RESULT([blk_mq], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request]) + ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request]) + ], [ + AC_MSG_RESULT(no) + ]) ], [ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index c5a353ca9203..b6ce1e1cf083 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # bdev_file_open_by_path() replaced bdev_open_by_path(), +dnl # and returns struct file* +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ + ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [ + #include <linux/fs.h> + #include <linux/blkdev.h> + ], [ + struct file *file __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + file = bdev_file_open_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ @@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ [bdev_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_file_open_by_path() exists]) + ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [ + AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1, + [bdev_file_open_by_path() exists]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) ]) ]) ]) @@ -149,10 +178,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # +dnl # bdev_release() now private, but because bdev_file_open_by_path() returns +dnl # struct file*, we can just use fput(). So the blkdev_put test no longer +dnl # fails if not found.
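A minimal consumer-side sketch of the three-way fallback these checks establish. The zfs_bdev_* wrapper names and the handle typedef are illustrative assumptions, not the actual OpenZFS vdev code; the argument details of blkdev_get_by_path()/blkdev_put() are simplified here, since the surrounding checks exist precisely because they vary by kernel:

#include <linux/blkdev.h>
#include <linux/file.h>	/* fput() */

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
/* 6.9+: open returns a struct file; close is just fput() */
typedef struct file zfs_bdev_handle_t;
#define	zfs_bdev_open(path, mode, holder, ops)	\
	bdev_file_open_by_path(path, mode, holder, ops)
#define	zfs_bdev_close(bdh)	fput(bdh)
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
/* 6.8: open returns a struct bdev_handle; close with bdev_release() */
typedef struct bdev_handle zfs_bdev_handle_t;
#define	zfs_bdev_open(path, mode, holder, ops)	\
	bdev_open_by_path(path, mode, holder, ops)
#define	zfs_bdev_close(bdh)	bdev_release(bdh)
#else
/*
 * Older kernels work on struct block_device directly; the second
 * argument to blkdev_put() is the mode or the holder depending on
 * the HAVE_BLKDEV_PUT_HOLDER result (holder variant shown here).
 */
typedef struct block_device zfs_bdev_handle_t;
#define	zfs_bdev_open(path, mode, holder, ops)	\
	blkdev_get_by_path(path, mode, holder)
#define	zfs_bdev_close(bdh)	blkdev_put(bdh, NULL)
#endif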
+dnl # + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) @@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_RESULT(no) ]) ]) ]) @@ -523,12 +561,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ ]) dnl # -dnl # 5.19 API: blkdev_issue_secure_erase() -dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) -dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # TRIM support: discard and secure erase. We make use of asynchronous +dnl # functions when available. dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ - ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ +dnl # 3.10: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: [not available] +dnl # async erase: [not available] +dnl # +dnl # 4.7: +dnl # sync discard: blkdev_issue_discard(..., 0) +dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # async discard: __blkdev_issue_discard(..., 0) +dnl # async erase: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) +dnl # +dnl # 5.19: +dnl # sync discard: blkdev_issue_discard(...) +dnl # sync erase: blkdev_issue_secure_erase(...) +dnl # async discard: __blkdev_issue_discard(...) +dnl # async erase: [not available] +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [ #include ],[ struct block_device *bdev = NULL; @@ -536,10 +591,33 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector_t nr_sects = 0; int error __attribute__ ((unused)); - error = blkdev_issue_secure_erase(bdev, + error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + int error __attribute__ ((unused)); + error = blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, &biop); + ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ #include ],[ @@ -553,22 +631,52 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags, &biop); ]) - - ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ + ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; - unsigned long flags = 0; int error __attribute__ ((unused)); - error = blkdev_issue_discard(bdev, - sector, nr_sects, GFP_KERNEL, flags); + error = blkdev_issue_secure_erase(bdev, + sector, nr_sects, GFP_KERNEL); ]) ]) -AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [ + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [ + 
AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1, + [blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1, + [blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1, + [__blkdev_issue_discard() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1, + [__blkdev_issue_discard(flags) is available]) + ],[ + AC_MSG_RESULT(no) + ]) AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) @@ -576,24 +684,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, - [__blkdev_issue_discard() is available]) - ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) - ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) - ]) - ]) ]) ]) @@ -645,6 +735,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH + ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE @@ -657,7 +748,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME - ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE @@ -678,7 +769,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS - ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 index 745928168f92..0b7da828d299 100644 --- a/config/kernel-filemap.m4 +++ b/config/kernel-filemap.m4 @@ -4,6 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ #include + #include ],[ struct address_space *mapping = NULL; loff_t lstart = 0; diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 4d20dd45c4a1..9813ad2fb3f3 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(NUMA_NO_NODE); ]) + 
ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [ + #include <linux/blkdev.h> + ],[ + struct queue_limits *lim = NULL; + struct gendisk *disk __attribute__ ((unused)); + disk = blk_alloc_disk(lim, NUMA_NO_NODE); + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include <linux/blkdev.h> ],[ @@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ ], [ AC_MSG_RESULT(no) ]) + + dnl # + dnl # Linux 6.9 API Change: + dnl # blk_alloc_queue() takes a nullable queue_limits arg. + dnl # + AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args]) + ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + + dnl # + dnl # 5.20 API change, + dnl # Removed blk_cleanup_disk(), put_disk() should be used. + dnl # + AC_MSG_CHECKING([whether blk_cleanup_disk() exists]) + ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1, + [blk_cleanup_disk() exists]) + ], [ + AC_MSG_RESULT(no) + ]) + ], [ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 000000000000..d5ebd926986a --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include <linux/mm.h> + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 index cc96404d8bbe..8a5cbe2eeeed 100644 --- a/config/kernel-vfs-file_range.m4 +++ b/config/kernel-vfs-file_range.m4 @@ -16,6 +16,9 @@ dnl # dnl # 5.3: VFS copy_file_range() expected to do its own fallback, dnl # generic_copy_file_range() added to support it dnl # +dnl # 6.8: generic_copy_file_range() removed, replaced by +dnl # splice_copy_file_range() +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ #include <linux/fs.h> @@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([splice_copy_file_range], [ + #include <linux/splice.h> + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + splice_copy_file_range(src_file, src_off, dst_file, dst_off, + len); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether splice_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1, + [splice_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ #include <linux/fs.h> diff --git a/config/kernel.m4 b/config/kernel.m4 index e3f8645774c5..548905ccd04d 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE @@ -166,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -266,6 +268,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE @@ -314,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV + ZFS_AC_KERNEL_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/config/user-backtrace.m4 b/config/user-backtrace.m4 new file mode 100644 index 000000000000..25706767cdc3 --- /dev/null +++ b/config/user-backtrace.m4 @@ -0,0 +1,14 @@ +dnl +dnl backtrace(), for userspace assertions. glibc has this directly in libc. +dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed +dnl that this will also get the companion function backtrace_symbols(). +dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [ + AX_SAVE_FLAGS + LIBS="" + AC_SEARCH_LIBS([backtrace], [execinfo], [ + AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available]) + AC_SUBST([BACKTRACE_LIBS], ["$LIBS"]) + ]) + AX_RESTORE_FLAGS +]) diff --git a/config/user-libunwind.m4 b/config/user-libunwind.m4 new file mode 100644 index 000000000000..99ba3dcf452d --- /dev/null +++ b/config/user-libunwind.m4 @@ -0,0 +1,44 @@ +dnl +dnl Checks for libunwind, which usually does a better job than backtrace() when +dnl resolving symbols in the stack backtrace. Newer versions have support for +dnl getting info about the object file the function came from, so we look for +dnl that too and use it if found. +dnl +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [ + AC_ARG_WITH([libunwind], + AS_HELP_STRING([--with-libunwind], + [use libunwind for backtraces in userspace assertions]), + [], + [with_libunwind=auto]) + + AS_IF([test "x$with_libunwind" != "xno"], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [ + dnl unw_get_elf_filename() is sometimes a macro, other + dnl times a proper symbol, so we can't just do a link + dnl check; we need to include the header properly. 
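These two new userspace checks (backtrace() above, libunwind here) exist so assertion failures can print a stack trace. As a hedged sketch of the kind of consumer HAVE_BACKTRACE enables — print_stack() is hypothetical, not the actual libspl routine; link with $(BACKTRACE_LIBS) where -lexecinfo is needed:

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_BACKTRACE
#include <execinfo.h>
#endif

static void
print_stack(void)
{
#ifdef HAVE_BACKTRACE
	void *frames[64];
	int nframes = backtrace(frames, 64);
	char **syms = backtrace_symbols(frames, nframes);

	if (syms == NULL)
		return;
	for (int i = 0; i < nframes; i++)
		(void) fprintf(stderr, "  %s\n", syms[i]);
	free(syms);
#else
	(void) fprintf(stderr, "  (backtrace not supported here)\n");
#endif
}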
+ AX_SAVE_FLAGS + CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS" + LIBS="$LIBS $LIBUNWIND_LIBS" + AC_MSG_CHECKING([for unw_get_elf_filename in libunwind]) + AC_LINK_IFELSE([ + AC_LANG_PROGRAM([ + #define UNW_LOCAL_ONLY + #include + ], [ + unw_get_elf_filename(0, 0, 0, 0); + ]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LIBUNWIND_ELF, 1, + [libunwind has unw_get_elf_filename]) + ], [ + AC_MSG_RESULT([no]) + ]) + AX_RESTORE_FLAGS + ], [ + AS_IF([test "x$with_libunwind" = "xyes"], [ + AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel]) + ]) + ]) + ]) +]) diff --git a/config/user.m4 b/config/user.m4 index 87df8c7ccabd..badd920d2b8a 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -26,12 +26,14 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_AIO_H ZFS_AC_CONFIG_USER_CLOCK_GETTIME ZFS_AC_CONFIG_USER_PAM + ZFS_AC_CONFIG_USER_BACKTRACE + ZFS_AC_CONFIG_USER_LIBUNWIND ZFS_AC_CONFIG_USER_RUNSTATEDIR ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 5f36569fe25b..bb5a85d815d1 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - *) DEFAULT_INIT_SHELL="/bin/sh" ;; + gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + IS_SYSV_RC=false ;; + *) DEFAULT_INIT_SHELL=/bin/sh + IS_SYSV_RC=true ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) + AC_SUBST(IS_SYSV_RC) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], diff --git a/contrib/debian/control b/contrib/debian/control index 98beb900d0fa..e56fbf0f1c93 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -189,7 +189,7 @@ Depends: dkms (>> 2.1.1.2-5), file, libc6-dev | libc-dev, lsb-release, - python3-distutils | libpython3-stdlib (<< 3.6.4), + python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4), ${misc:Depends}, ${perl:Depends} Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends} diff --git a/etc/init.d/README.md b/etc/init.d/README.md index 2de05042ce63..da780fdc1222 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -7,11 +7,7 @@ DESCRIPTION They have been tested successfully on: - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - * Ubuntu Trusty - * CentOS 6.0 - * CentOS 6.6 + * Debian GNU/Linux Bookworm * Gentoo SUPPORT diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in index a9a0604f81ac..ff169eb96d86 100755 --- a/etc/init.d/zfs-import.in +++ b/etc/init.d/zfs-import.in @@ -307,7 +307,7 @@ do_start() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-load-key.in b/etc/init.d/zfs-load-key.in index 53c7766b793a..27dfeeb0bcc5 100755 --- a/etc/init.d/zfs-load-key.in +++ b/etc/init.d/zfs-load-key.in @@ -104,7 +104,7 @@ do_stop() # ---------------------------------------------------- -if [ ! 
-e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in index a0825f19fcdd..6a3ca5f86908 100755 --- a/etc/init.d/zfs-mount.in +++ b/etc/init.d/zfs-mount.in @@ -114,7 +114,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in index 88978071cbf6..06c59c620b75 100755 --- a/etc/init.d/zfs-share.in +++ b/etc/init.d/zfs-share.in @@ -57,7 +57,8 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in index e9cf8867403c..3d40600cea5d 100755 --- a/etc/init.d/zfs-zed.in +++ b/etc/init.d/zfs-zed.in @@ -93,7 +93,8 @@ do_reload() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/include/libzfs.h b/include/libzfs.h index 4f06b5d3c24c..2823b8845827 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2022 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. @@ -158,6 +158,7 @@ typedef enum zfs_error { EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ + EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ EZFS_UNKNOWN } zfs_error_t; @@ -261,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *); _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); -_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); +_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/libzutil.h b/include/libzutil.h index d9a9a65753dd..e2108ceeaa44 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018, 2024 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H @@ -79,6 +79,8 @@ typedef struct importargs { boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ + boolean_t do_destroyed; + boolean_t do_all; } importargs_t; typedef struct libpc_handle { diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 9819e534b7f6..292f79b8ce72 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -4,8 +4,6 @@ noinst_HEADERS = \ \ %D%/spl/acl/acl_common.h \ \ - %D%/spl/rpc/xdr.h \ - \ %D%/spl/sys/ia32/asm_linkage.h \ \ %D%/spl/sys/acl.h \ @@ -22,7 +20,6 @@ noinst_HEADERS = \ %D%/spl/sys/debug.h \ %D%/spl/sys/dirent.h \ %D%/spl/sys/disp.h \ - %D%/spl/sys/dkio.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ %D%/spl/sys/freebsd_rwlock.h \ @@ -80,7 +77,9 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ + %D%/zfs/sys/freebsd_event.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h deleted file mode 100644 index c98466e9d16a..000000000000 --- a/include/os/freebsd/spl/rpc/xdr.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. - * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. - * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. - * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. - * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. - * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 - */ - -#ifndef _OPENSOLARIS_RPC_XDR_H_ -#define _OPENSOLARIS_RPC_XDR_H_ - -#include -#include_next - -#if !defined(_KERNEL) && !defined(_STANDALONE) - -#include - -/* - * Taken from sys/xdr/xdr_mem.c. - * - * FreeBSD's userland XDR doesn't implement control method (only the kernel), - * but OpenSolaris nvpair still depend on it, so we have to implement it here. - */ -static __inline bool_t -xdrmem_control(XDR *xdrs, int request, void *info) -{ - xdr_bytesrec *xptr; - - switch (request) { - case XDR_GET_BYTES_AVAIL: - xptr = (xdr_bytesrec *)info; - xptr->xc_is_last_record = TRUE; - xptr->xc_num_avail = xdrs->x_handy; - return (TRUE); - default: - assert(!"unexpected request"); - } - return (FALSE); -} - -#undef XDR_CONTROL -#define XDR_CONTROL(xdrs, req, op) \ - (((xdrs)->x_ops->x_control == NULL) ? 
\ - xdrmem_control((xdrs), (req), (op)) : \ - (*(xdrs)->x_ops->x_control)(xdrs, req, op)) - -#endif /* !_KERNEL && !_STANDALONE */ - -#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index 785fcf62dd16..f041dde34fc8 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -56,11 +56,33 @@ /* * Common DEBUG functionality. */ +#ifdef __FreeBSD__ +#include +#endif + +#ifndef __printflike +#define __printflike(a, b) __printf(a, b) +#endif + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +/* + * Without this, we see warnings from objtool during normal Linux builds when + * the kernel is built with CONFIG_STACK_VALIDATION=y: + * + * warning: objtool: tsd_create() falls through to next function __list_add() + * warning: objtool: .text: unexpected end of section + * + * Until the toolchain stops doing this, we must only define this attribute on + * spl_panic() when doing static analysis. + */ #if defined(__COVERITY__) || defined(__clang_analyzer__) __attribute__((__noreturn__)) #endif extern void spl_panic(const char *file, const char *func, int line, - const char *fmt, ...) __attribute__((__noreturn__)); + const char *fmt, ...); extern void spl_dumpstack(void); static inline int @@ -73,8 +95,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif +#ifndef __linux__ #define likely(expr) expect((expr) != 0, 1) #define unlikely(expr) expect((expr) != 0, 0) +#endif #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) @@ -84,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("VERIFY(" #cond ") failed\n", \ __FILE__, __FUNCTION__, __LINE__)) +#define VERIFYF(cond, str, ...) do { \ + if (unlikely(!(cond))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ + } while (0) + #define VERIFY3B(LEFT, OP, RIGHT) do { \ const boolean_t _verify3_left = (boolean_t)(LEFT); \ const boolean_t _verify3_right = (boolean_t)(RIGHT); \ @@ -123,7 +153,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%p " #OP " %p)\n", \ + "failed (%px " #OP " %px)\n", \ (void *)_verify3_left, \ (void *)_verify3_right); \ } while (0) @@ -142,10 +172,98 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY0P(" #RIGHT ") " \ - "failed (NULL == %p)\n", \ + "failed (NULL == %px)\n", \ (void *)_verify0_right); \ } while (0) +/* + * Note that you should not put any operations you want to always happen + * in the print section for ASSERTs unless you only want them to run on + * debug builds! + * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug + * builds. + */ + +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...)
do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d) " STR "\n", \ + (boolean_t)(_verify3_left), \ + (boolean_t)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld) " STR "\n", \ + (long long)(_verify3_left), \ + (long long)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu) " STR "\n", \ + (unsigned long long)(_verify3_left), \ + (unsigned long long)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px) " STR "\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0PF(RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(0); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %px) " STR "\n", \ + (void *) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0F(RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %lld) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY_IMPLY(A, B) \ + ((void)(likely((!(A)) || (B)) || \ + spl_assert("(" #A ") implies (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + +#define VERIFY_EQUIV(A, B) \ + ((void)(likely(!!(A) == !!(B)) || \ + spl_assert("(" #A ") is equivalent to (" #B ")", \ + __FILE__, __FUNCTION__, __LINE__))) + /* * Debugging disabled (--disable-debug) */ @@ -162,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) +#define ASSERTF(x, str, ...)
ASSERT(x) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -178,16 +303,16 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F +#define ASSERTF VERIFYF #define ASSERT VERIFY -#define IMPLY(A, B) \ - ((void)(likely((!(A)) || (B)) || \ - spl_assert("(" #A ") implies (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) -#define EQUIV(A, B) \ - ((void)(likely(!!(A) == !!(B)) || \ - spl_assert("(" #A ") is equivalent to (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) - +#define IMPLY VERIFY_IMPLY +#define EQUIV VERIFY_EQUIV #endif /* NDEBUG */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 3830d198dfff..f31ae50b96af 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -5,6 +5,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/compiler_compat.h \ %D%/kernel/linux/dcache_compat.h \ %D%/kernel/linux/kmap_compat.h \ + %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ %D%/kernel/linux/percpu_compat.h \ @@ -46,6 +47,7 @@ kernel_sys_HEADERS = \ kernel_spl_rpcdir = $(kerneldir)/spl/rpc kernel_spl_rpc_HEADERS = \ + %D%/spl/rpc/types.h \ %D%/spl/rpc/xdr.h kernel_spl_sysdir = $(kerneldir)/spl/sys @@ -61,7 +63,6 @@ kernel_spl_sys_HEADERS = \ %D%/spl/sys/ctype.h \ %D%/spl/sys/debug.h \ %D%/spl/sys/disp.h \ - %D%/spl/sys/dkio.h \ %D%/spl/sys/errno.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index f111e648ccf7..658f546213de 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -94,6 +94,33 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) #endif } +/* + * Detect if a device has a write cache. Used to set the initial value for the + * vdev nowritecache flag. + * + * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed + * later by the operator. If not set, the kernel will return flush requests + * immediately without doing anything. + * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed. + * Only controls if the operator is allowed to change _WC. Initial version + * buggy; aliased to QUEUE_FLAG_FUA, so unusable. + * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed. + * + * On kernels older than 4.10 we just assume a write cache, and let the + * normal flush fail detection apply.
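A usage sketch for the zfs_bdev_has_write_cache() helper this comment describes (defined just below), together with the tightened bdev_discard_supported() that follows. The probe function and its output parameters are illustrative assumptions, not the actual vdev_disk code:

static void
example_probe_bdev(struct block_device *bdev, boolean_t *nowritecache,
    boolean_t *can_trim)
{
	/*
	 * A device with no write cache never needs flushes; a caller
	 * could use this to preset the vdev nowritecache flag.
	 */
	*nowritecache = !zfs_bdev_has_write_cache(bdev);

	/*
	 * TRIM is only worth advertising when the device reports both a
	 * nonzero maximum discard size and a nonzero discard granularity,
	 * which is exactly what the updated helper below checks.
	 */
	*can_trim = bdev_discard_supported(bdev);
}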
+ */ +static inline boolean_t +zfs_bdev_has_write_cache(struct block_device *bdev) +{ +#if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA + return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags)); +#elif defined(QUEUE_FLAG_WC) + return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags)); +#else + return (B_TRUE); +#endif +} + static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { @@ -563,9 +590,11 @@ static inline boolean_t bdev_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) - return (!!bdev_max_discard_sectors(bdev)); + return (bdev_max_discard_sectors(bdev) > 0 && + bdev_discard_granularity(bdev) > 0); #elif defined(HAVE_BLK_QUEUE_DISCARD) - return (!!blk_queue_discard(bdev_get_queue(bdev))); + return (blk_queue_discard(bdev_get_queue(bdev)) > 0 && + bdev_get_queue(bdev)->limits.discard_granularity > 0); #else #error "Unsupported kernel" #endif diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/linux/kernel/linux/mm_compat.h similarity index 71% rename from include/os/freebsd/spl/sys/dkio.h rename to include/os/linux/kernel/linux/mm_compat.h index cd747089d422..40056c68d6dd 100644 --- a/include/os/freebsd/spl/sys/dkio.h +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -17,18 +17,20 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - * - * $FreeBSD$ */ + /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2023, 2024, Klara Inc. */ -#ifndef _OPENSOLARIS_SYS_DKIO_H_ -#define _OPENSOLARIS_SYS_DKIO_H_ +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include -#define DKIOC (0x04 << 8) -#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ +/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif -#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ +#endif /* _ZFS_MM_COMPAT_H */ diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 8e20a9613539..039865b703ef 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -68,6 +68,7 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, + zfs_vdev_disk, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/include/os/linux/spl/sys/dkio.h b/include/os/linux/spl/rpc/types.h similarity index 50% rename from include/os/linux/spl/sys/dkio.h rename to include/os/linux/spl/rpc/types.h index a90b67d36702..5bbb4f2dec46 100644 --- a/include/os/linux/spl/sys/dkio.h +++ b/include/os/linux/spl/rpc/types.h @@ -1,9 +1,6 @@ /* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 + * Copyright (c) 2008 Sun Microsystems, Inc. + * Written by Ricardo Correia * * This file is part of the SPL, Solaris Porting Layer. * @@ -21,19 +18,13 @@ * with the SPL. If not, see . 
*/ -#ifndef _SPL_DKIO_H -#define _SPL_DKIO_H +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H -#define DFL_SZ(num_exts) \ - (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) +#include -#define DKIOC (0x04 << 8) -#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ +/* Just enough to support rpc/xdr.h */ -/* - * ioctl to free space (e.g. SCSI UNMAP) off a disk. - * Pass a dkioc_free_list_t containing a list of extents to be freed. - */ -#define DKIOCFREE (DKIOC|50) +typedef int bool_t; -#endif /* _SPL_DKIO_H */ +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h index b00f3542fcdf..5b621fa9c863 100644 --- a/include/os/linux/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -23,8 +23,6 @@ #include -typedef int bool_t; - /* * XDR enums and types. */ diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 288193ad21c5..f041dde34fc8 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -1,24 +1,29 @@ /* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf . - * UCRL-CODE-235197 + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. * - * This file is part of the SPL, Solaris Porting Layer. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see . + * $FreeBSD$ */ /* @@ -47,10 +52,17 @@ #ifndef _SPL_DEBUG_H #define _SPL_DEBUG_H + /* * Common DEBUG functionality. 
*/ +#ifdef __FreeBSD__ +#include +#endif + +#ifndef __printflike #define __printflike(a, b) __printf(a, b) +#endif #ifndef __maybe_unused #define __maybe_unused __attribute__((unused)) @@ -80,6 +92,14 @@ spl_assert(const char *buf, const char *file, const char *func, int line) return (0); } +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#ifndef __linux__ +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) +#endif + #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) @@ -88,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("VERIFY(" #cond ") failed\n", \ __FILE__, __FUNCTION__, __LINE__)) +#define VERIFYF(cond, str, ...) do { \ + if (unlikely(!(cond))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\ + } while (0) + #define VERIFY3B(LEFT, OP, RIGHT) do { \ const boolean_t _verify3_left = (boolean_t)(LEFT); \ const boolean_t _verify3_right = (boolean_t)(RIGHT); \ @@ -150,6 +176,84 @@ spl_assert(const char *buf, const char *file, const char *func, int line) (void *)_verify0_right); \ } while (0) +/* + * Note that you should not put any operations you want to always happen + * in the print section for ASSERTs unless you only want them to run on + * debug builds! + * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug + * builds. + */ + +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d) " STR "\n", \ + (boolean_t)(_verify3_left), \ + (boolean_t)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld) " STR "\n", \ + (long long)(_verify3_left), \ + (long long)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu) " STR "\n", \ + (unsigned long long)(_verify3_left), \ + (unsigned long long)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px) " STR "\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0PF(RIGHT, STR, ...)
do { \ + const uintptr_t _verify3_left = (uintptr_t)(0); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %px) " STR "\n", \ + (void *)(_verify3_right), \ + __VA_ARGS__); \ + } while (0) + +#define VERIFY0F(RIGHT, STR, ...) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0(0 == " #RIGHT ") " \ + "failed (0 == %lld) " STR "\n", \ + (long long) (_verify3_right), \ + __VA_ARGS__); \ + } while (0) + #define VERIFY_IMPLY(A, B) \ ((void)(likely((!(A)) || (B)) || \ spl_assert("(" #A ") implies (" #B ")", \ @@ -176,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) +#define ASSERTF(x, str, ...) ASSERT(x) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) #define EQUIV(A, B) \ @@ -192,6 +303,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F +#define ASSERTF VERIFYF #define ASSERT VERIFY #define IMPLY VERIFY_IMPLY #define EQUIV VERIFY_EQUIV diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h index 3d4b1920d598..6ffa57c86418 100644 --- a/include/os/linux/zfs/sys/trace_common.h +++ b/include/os/linux/zfs/sys/trace_common.h @@ -31,7 +31,6 @@ /* ZIO macros */ #define ZIO_TP_STRUCT_ENTRY \ __field(zio_type_t, zio_type) \ - __field(int, zio_cmd) \ __field(zio_priority_t, zio_priority) \ __field(uint64_t, zio_size) \ __field(uint64_t, zio_orig_size) \ @@ -61,7 +60,6 @@ #define ZIO_TP_FAST_ASSIGN \ __entry->zio_type = zio->io_type; \ - __entry->zio_cmd = zio->io_cmd; \ __entry->zio_priority = zio->io_priority; \ __entry->zio_size = zio->io_size; \ __entry->zio_orig_size = zio->io_orig_size; \ @@ -90,7 +88,7 @@ __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; #define ZIO_TP_PRINTK_FMT \ - "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ + "zio { type %u prio %u size %llu orig_size %llu " \ "offset %llu timestamp %llu delta %llu delay %llu " \ "flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx " \ "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ @@ -98,7 +96,7 @@ "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" #define ZIO_TP_PRINTK_ARGS \ - __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ + __entry->zio_type, __entry->zio_priority, \ __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
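A brief usage sketch for the formatted VERIFY/ASSERT macros added to debug.h above; obj, size and expected are illustrative names, not part of this change:

    /* Always evaluated; panics with both values plus the extra context. */
    VERIFY3UF(size, ==, expected, "object %llu misconfigured",
        (u_longlong_t)obj);

    /*
     * Debug builds only; per the note accompanying the macros, the format
     * arguments are not evaluated in non-debug builds, so they must be
     * free of side effects.
     */
    ASSERT3UF(size, ==, expected, "object %llu", (u_longlong_t)obj);

diff --git a/include/sys/abd.h 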
b/include/sys/abd.h index b48dc36423f7..19fe96292d5f 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); @@ -213,6 +220,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 40546d4af137..f88ea25e245d 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 921f51f27a20..b5fed64da4ad 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -752,8 +752,6 @@ void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. 
*/ void dmu_buf_user_evict_wait(void); @@ -902,6 +900,8 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index f00e13cf03a6..322472fb1ae2 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -45,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 2e3452e5ebaa..f32f59a2bedf 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -173,6 +173,7 @@ typedef struct dsl_scan { dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ + kmutex_t scn_queue_lock; /* serializes scn_queue inserts */ uint64_t scn_queues_pending; /* outstanding data to issue */ /* members needed for syncing error scrub status to disk */ dsl_errorscrub_phys_t errorscrub_phys; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 025567e2183f..e191420f2d2d 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1094,11 +1094,17 @@ typedef enum zio_type { ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, - ZIO_TYPE_IOCTL, + ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; +/* + * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to + * user programs. 
+ */ +#define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH + /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. @@ -1603,6 +1609,7 @@ typedef enum { ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, + ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab38..e7de86f2379b 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index cada3c841037..3073c4d1b937 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2021 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. 
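Referring back to the ZIO_TYPE_IOCTL compatibility alias added in sys/fs/zfs.h above, existing user programs keep compiling unchanged; a hypothetical consumer (zt and count_flush_event are illustrative):

    zio_type_t zt = ZIO_TYPE_IOCTL;    /* now the same value as ZIO_TYPE_FLUSH */
    if (zt == ZIO_TYPE_FLUSH)
        count_flush_event();           /* taken: the alias preserves old code */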
@@ -125,15 +125,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -165,7 +165,6 @@ typedef struct zio_cksum_salt { * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator @@ -190,11 +189,11 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -355,7 +354,7 @@ typedef enum bp_embedded_type { #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + ((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1])) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ @@ -374,8 +373,7 @@ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; @@ -395,9 +393,6 @@ typedef struct blkptr { BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) @@ -480,15 +475,23 @@ typedef struct blkptr { #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? 
(bp)->blk_phys_birth : (bp)->blk_birth) +#define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] +#define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) + +#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] +#define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) + +#define BP_GET_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ + BP_GET_LOGICAL_BIRTH(bp)) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ + BP_SET_LOGICAL_BIRTH(bp, logical); \ + BP_SET_PHYSICAL_BIRTH(bp, \ + ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ @@ -541,8 +544,8 @@ typedef struct blkptr { (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ + (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ + BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) @@ -581,8 +584,8 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ + (bp)->blk_birth_word[0] = 0; \ + (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } @@ -631,7 +634,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ @@ -642,14 +645,14 @@ typedef struct blkptr { compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -691,8 +694,8 @@ typedef struct blkptr { ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ @@ -767,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -782,7 +785,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int 
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, @@ -826,10 +829,14 @@ extern uint_t zfs_sync_pass_deferred_free; /* spa sync taskqueues */ taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); void spa_sync_tq_destroy(spa_t *spa); +uint_t spa_acq_allocator(spa_t *spa); +void spa_rel_allocator(spa_t *spa, uint_t allocator); void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; +extern avl_tree_t spa_namespace_avl; +extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c @@ -1118,6 +1125,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); @@ -1142,9 +1151,9 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, - const uint64_t *birth); + const uint64_t birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, - const uint64_t *birth); + uint64_t birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0cd0c4720fbe..a40914ec5fcb 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -63,6 +63,12 @@ typedef struct spa_alloc { avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; +typedef struct spa_allocs_use { + kmutex_t sau_lock; + uint_t sau_rotor; + boolean_t sau_inuse[]; +} spa_allocs_use_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; @@ -192,7 +198,7 @@ typedef struct spa_taskqs { /* one for each thread in the spa sync taskq */ typedef struct spa_syncthread_info { kthread_t *sti_thread; - taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ + uint_t sti_allocator; } spa_syncthread_info_t; typedef enum spa_all_vdev_zap_action { @@ -237,6 +243,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ @@ -269,6 +276,7 @@ struct spa { * allocation performance in write-heavy workloads. 
*/ spa_alloc_t *spa_allocs; + spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index d3a71cc8f84b..e480a4bac0b9 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ @@ -165,7 +165,7 @@ struct uberblock { * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], * the value of the field is used to determine which ZIL blocks have * been allocated according to the ms_sm when we are rewinding to a - * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * checkpoint. Specifically, if logical birth > ub_checkpoint_txg, then * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. */ uint64_t ub_checkpoint_txg; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index f39ebf031cea..57ff31e89eb9 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -274,7 +273,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? 
*/ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ @@ -455,7 +454,7 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Vdev properties for tuning ZED + * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5faceb..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index ebc67c2bf465..e54456d3472b 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). */ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -132,7 +132,7 @@ typedef struct zap_leaf_phys { * with the ZAP_LEAF_CHUNK() macro. */ - uint16_t l_hash[1]; + uint16_t l_hash[]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1< * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. 
* Copyright (c) 2019-2020, Michael Niewöhner */ @@ -451,7 +451,6 @@ struct zio { zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; - int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; @@ -529,9 +528,6 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; - - /* write issue taskq selection, based upon sync thread */ - taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { @@ -579,9 +575,6 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, zio_flag_t flags); -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *priv, zio_flag_t flags); - extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags); @@ -690,6 +683,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); +extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); +extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index febe0a87b428..2b026d48675a 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara Inc. */ #ifndef _ZIO_IMPL_H @@ -39,7 +40,7 @@ extern "C" { * * The ZFS I/O pipeline is comprised of various stages which are defined * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. + * these basic I/O operations: Read, Write, Free, Claim, Flush and Trim. 
* * I/O operations: (XXX - provide detail for each of the operations) * * Read: * Write: * Free: * Claim: - * Ioctl: + * Flush: + * Trim: * * Although the most common pipelines are used by the basic I/O operations * above, there are some helper pipelines (one could consider them @@ -120,43 +122,43 @@ * zio pipeline stage definitions */ enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ + ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ - ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ - ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R----- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F--- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* -WF--T */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W---- */ - ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */ + ZIO_STAGE_ENCRYPT = 1 << 6, /* -W---- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W---- */ - ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */ - ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */ + ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */ - ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */ + ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */ + ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */ + ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W---- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W---- */ + ZIO_STAGE_DVA_FREE = 1 << 18, /* --F--- */ + ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C-- */ - ZIO_STAGE_READY = 1 << 20, /* RWFCI */ + ZIO_STAGE_READY = 1 << 20, /* RWFCXT */ - ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--XT */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--XT */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ }; #define ZIO_ROOT_PIPELINE \ @@ -257,10 +259,9 @@ enum zio_stage { (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_DVA_CLAIM) -#define ZIO_IOCTL_PIPELINE \ +#define ZIO_FLUSH_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_ASSESS) + ZIO_VDEV_IO_STAGES) #define ZIO_TRIM_PIPELINE \ (ZIO_INTERLOCK_STAGES | \
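As context for the flush pipeline change above, a sketch of how the stage masks compose, assuming the usual ZIO_VDEV_IO_STAGES definition (not shown in this hunk):

    /*
     * Assumed definition:
     *   #define ZIO_VDEV_IO_STAGES (ZIO_STAGE_VDEV_IO_START | \
     *       ZIO_STAGE_VDEV_IO_DONE | ZIO_STAGE_VDEV_IO_ASSESS)
     * With it, the new ZIO_FLUSH_PIPELINE also runs ZIO_STAGE_VDEV_IO_DONE,
     * which the old ZIO_IOCTL_PIPELINE deliberately skipped.
     */

diff --git a/lib/libspl/Makefile.am index 822bef7e7a8d..94be416d46aa 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -1,6 +1,6 @@ include $(srcdir)/%D%/include/Makefile.am -libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS) libspl_la_CFLAGS = 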
$(libspl_assert_la_CFLAGS) noinst_LTLIBRARIES += libspl_assert.la libspl.la @@ -43,3 +43,9 @@ libspl_la_LIBADD = \ libspl_assert.la libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME) + +libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS) + +if BUILD_FREEBSD +libspl_assert_la_LIBADD += -lpthread +endif diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c index 9d44740d4e3c..5b12c14acd6e 100644 --- a/lib/libspl/assert.c +++ b/lib/libspl/assert.c @@ -22,8 +22,96 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2024, Rob Norris + */ #include +#include + +#if defined(__linux__) +#include +#include +#ifdef HAVE_GETTID +#define libspl_gettid() gettid() +#else +#include +#define libspl_gettid() ((pid_t)syscall(__NR_gettid)) +#endif +#define libspl_getprogname() (program_invocation_short_name) +#define libspl_getthreadname(buf, len) \ + prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0) +#elif defined(__FreeBSD__) || defined(__APPLE__) +#if !defined(__APPLE__) +#include +#define libspl_gettid() pthread_getthreadid_np() +#endif +#define libspl_getprogname() getprogname() +#define libspl_getthreadname(buf, len) \ + pthread_getname_np(pthread_self(), buf, len); +#endif + +#if defined(HAVE_LIBUNWIND) +#define UNW_LOCAL_ONLY +#include + +static inline void +libspl_dump_backtrace(void) +{ + unw_context_t uc; + unw_cursor_t cp; + unw_word_t ip, off; + char funcname[128]; +#ifdef HAVE_LIBUNWIND_ELF + char objname[128]; + unw_word_t objoff; +#endif + + fprintf(stderr, "Call trace:\n"); + unw_getcontext(&uc); + unw_init_local(&cp, &uc); + while (unw_step(&cp) > 0) { + unw_get_reg(&cp, UNW_REG_IP, &ip); + unw_get_proc_name(&cp, funcname, sizeof (funcname), &off); +#ifdef HAVE_LIBUNWIND_ELF + unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff); + fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n", + ip, funcname, off, objname, objoff); +#else + fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off); +#endif + } +} +#elif defined(HAVE_BACKTRACE) +#include + +static inline void +libspl_dump_backtrace(void) +{ + void *btptrs[100]; + size_t nptrs = backtrace(btptrs, 100); + char **bt = backtrace_symbols(btptrs, nptrs); + fprintf(stderr, "Call trace:\n"); + for (size_t i = 0; i < nptrs; i++) + fprintf(stderr, " %s\n", bt[i]); + free(bt); +} +#else +#define libspl_dump_backtrace() +#endif + +#if defined(__APPLE__) +static inline uint64_t +libspl_gettid(void) +{ + uint64_t tid; + + if (pthread_threadid_np(NULL, &tid) != 0) + tid = 0; + + return (tid); +} +#endif static boolean_t libspl_assert_ok = B_FALSE; @@ -33,21 +121,41 @@ libspl_set_assert_ok(boolean_t val) libspl_assert_ok = val; } +static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER; + /* printf version of libspl_assert */ void libspl_assertf(const char *file, const char *func, int line, const char *format, ...) 
{ + pthread_mutex_lock(&assert_lock); + va_list args; + char tname[64]; + + libspl_getthreadname(tname, sizeof (tname)); + + fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func); va_start(args, format); vfprintf(stderr, format, args); - fprintf(stderr, "\n"); - fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func); va_end(args); + fprintf(stderr, "\n" + " PID: %-8u COMM: %s\n" +#if defined(__APPLE__) + " TID: %-8" PRIu64 " NAME: %s\n", +#else + " TID: %-8u NAME: %s\n", +#endif + getpid(), libspl_getprogname(), + libspl_gettid(), tname); + + libspl_dump_backtrace(); + #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__) if (libspl_assert_ok) { + pthread_mutex_unlock(&assert_lock); return; } #endif diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index 57f5719c1ac1..155bbab3020a 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -70,6 +70,15 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) #define VERIFY(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) + +#define VERIFYF(cond, STR, ...) \ +do { \ + if (!(cond)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s " STR, #cond, \ + __VA_ARGS__); \ +} while (0) + #define verify(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) @@ -132,6 +141,79 @@ do { \ (void *)__left); \ } while (0) +/* + * This is just here because cstyle gets upset about #LEFT + * on a newline. + */ + +/* BEGIN CSTYLED */ +#define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const boolean_t __left = (boolean_t)(LEFT); \ + const boolean_t __right = (boolean_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3SF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const int64_t __left = (int64_t)(LEFT); \ + const int64_t __right = (int64_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3UF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const uint64_t __left = (uint64_t)(LEFT); \ + const uint64_t __right = (uint64_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) + +#define VERIFY3PF(LEFT, OP, RIGHT, STR, ...) \ +do { \ + const uintptr_t __left = (uintptr_t)(LEFT); \ + const uintptr_t __right = (uintptr_t)(RIGHT); \ + if (!(__left OP __right)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s %s %s (0x%llx %s 0x%llx) " STR, \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right, \ + __VA_ARGS__); \ +} while (0) +/* END CSTYLED */ + +#define VERIFY0F(LEFT, STR, ...) \ +do { \ + const uint64_t __left = (uint64_t)(LEFT); \ + if (!(__left == 0)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s == 0 (0x%llx == 0) " STR, #LEFT, \ + (u_longlong_t)__left, __VA_ARGS__); \ +} while (0) + +#define VERIFY0PF(LEFT, STR, ...) 
\ +do { \ + const uintptr_t __left = (uintptr_t)(LEFT); \ + if (!(__left == 0)) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, \ + "%s == 0 (%p == 0) " STR, #LEFT, \ + (void *)__left, __VA_ARGS__); \ +} while (0) + #ifdef assert #undef assert #endif @@ -147,7 +229,15 @@ do { \ ((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z))) #define ASSERT0(x) ((void) sizeof ((uintptr_t)(x))) #define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT3BF(x, y, z, str, ...) ASSERT3B(x, y, z) +#define ASSERT3SF(x, y, z, str, ...) ASSERT3S(x, y, z) +#define ASSERT3UF(x, y, z, str, ...) ASSERT3U(x, y, z) +#define ASSERT3PF(x, y, z, str, ...) ASSERT3P(x, y, z) +#define ASSERT0P(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERT0PF(x, str, ...) ASSERT0P(x) +#define ASSERT0F(x, str, ...) ASSERT0(x) #define ASSERT(x) ((void) sizeof ((uintptr_t)(x))) +#define ASSERTF(x, str, ...) ASSERT(x) #define assert(x) ((void) sizeof ((uintptr_t)(x))) #define IMPLY(A, B) \ ((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B))) @@ -160,7 +250,14 @@ do { \ #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT0P VERIFY0P +#define ASSERT3BF VERIFY3BF +#define ASSERT3SF VERIFY3SF +#define ASSERT3UF VERIFY3UF +#define ASSERT3PF VERIFY3PF +#define ASSERT0PF VERIFY0PF +#define ASSERT0F VERIFY0F #define ASSERT VERIFY +#define ASSERTF VERIFYF #define assert VERIFY #define IMPLY(A, B) \ ((void)(((!(A)) || (B)) || \ diff --git a/lib/libuutil/uu_list.c index 0ca6f05205e9..aa8b129cc22a 100644 --- a/lib/libuutil/uu_list.c +++ b/lib/libuutil/uu_list.c @@ -505,14 +505,20 @@ uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) } if (lp->ul_debug || robust) { - uu_list_walk_t my_walk; + uu_list_walk_t *my_walk; void *e; - list_walk_init(&my_walk, lp, flags); + my_walk = uu_zalloc(sizeof (*my_walk)); + if (my_walk == NULL) + return (-1); + + list_walk_init(my_walk, lp, flags); while (status == UU_WALK_NEXT && - (e = uu_list_walk_next(&my_walk)) != NULL) + (e = uu_list_walk_next(my_walk)) != NULL) status = (*func)(e, private); - list_walk_fini(&my_walk); + list_walk_fini(my_walk); + + uu_free(my_walk); } else { if (!reverse) { for (np = lp->ul_null_node.uln_next; diff --git a/lib/libzdb/libzdb.c index 9989fa1eb80f..12144dc65e75 100644 --- a/lib/libzdb/libzdb.c +++ b/lib/libzdb/libzdb.c @@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg) * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, * it's possible the offsets are equal. 
In that case, sort by txg */ - if (l->blk_birth < r->blk_birth) { + if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) { return (-1); - } else if (l->blk_birth > r->blk_birth) { + } else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) { return (+1); } return (0); } diff --git a/lib/libzfs/libzfs.abi index cdd2f04c2629..2bbaae6345ab 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi diff --git a/lib/libzfs/libzfs_dataset.c index 6f8773aed425..231bbbd92dbf 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -5565,8 +5565,21 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. + * + * Bitshift is to avoid the case of nblocks * asize < tsize + * producing a size of 0. + */ + volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT); + /* + * If we would blow UINT64_MAX with this next multiplication, + * don't. */ - volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; + if (volsize > + (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT))) + volsize = UINT64_MAX; + else + volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + if (volsize > ret) { ret = volsize; }
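The volsize_from_vdevs change above reorders the arithmetic to avoid both truncation to zero and 64-bit overflow; a standalone sketch of the same guard (illustrative only, reusing the constants and variables from the diff):

    /* Divide first so a small nblocks * asize cannot truncate to zero. */
    uint64_t volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT);
    const uint64_t factor = SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
    /* Saturate at UINT64_MAX rather than wrapping on the multiply. */
    volsize = (volsize > UINT64_MAX / factor) ? UINT64_MAX : volsize * factor;

diff --git a/lib/libzfs/libzfs_pool.c index 402c14a6baee..979bbdd3809a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) * necessary verification to ensure that the vdev specification is well-formed. 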
*/ int -zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) { zfs_cmd_t zc = {"\0"}; int ret; @@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_flags = check_ashift; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { @@ -1899,7 +1900,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " @@ -1961,7 +1963,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index d7b90ccb1cba..526f57ea403c 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1053,6 +1053,7 @@ send_progress_thread(void *arg) } } pthread_cleanup_pop(B_TRUE); + return (NULL); } static boolean_t diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 8e70af2e5830..73ae0950ccb6 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation @@ -319,6 +319,9 @@ libzfs_error_description(libzfs_handle_t *hdl) "dataset without force")); case EZFS_RAIDZ_EXPAND_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); + case EZFS_ASHIFT_MISMATCH: + return (dgettext(TEXT_DOMAIN, "adding devices with " + "different physical sector sizes is not allowed")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -768,6 +771,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); break; + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index ffad7fc02bc9..a3930ee07f73 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -92,7 +92,8 @@ zk_thread_wrapper(void *arg) } kthread_t * -zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) +zk_thread_create(const char *name, void (*func)(void *), void *arg, + size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; @@ -140,6 +141,8 @@ zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw)); VERIFY0(pthread_attr_destroy(&attr)); + pthread_setname_np(tid, name); + return ((void *)(uintptr_t)tid); } diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 99a181ec3c93..5fb2283cf0b1 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -295,8 +295,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, } for (t = 0; t < nthreads; t++) - VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, - taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); + VERIFY((tq->tq_threadlist[t] = thread_create_named(tq->tq_name, + NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); return (tq); } diff --git a/man/Makefile.am b/man/Makefile.am index 45156571eec3..43bb014ddd32 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,7 +62,6 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ - %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ @@ -115,7 +114,8 @@ endif nodist_man_MANS = \ %D%/man8/zed.8 \ - %D%/man8/zfs-mount-generator.8 + %D%/man8/zfs-mount-generator.8 \ + %D%/man8/zfs_prepare_disk.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 30c168253f96..6895a2a6d79f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2,6 +2,7 @@ .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. +.\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd July 21, 2023 +.Dd February 14, 2024 .Dt ZFS 4 .Os . @@ -244,12 +245,25 @@ For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Controls prefetching BRT records for blocks which are going to be cloned. +. +.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP data block size as a power of 2. Note that changing this after +creating a BRT on the pool will not affect existing BRTs, only newly created +ones. +. +.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP indirect block size as a power of 2. 
Note that changing this +after creating a BRT on the pool will not affect existing BRTs, only newly +created ones. +. +.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. @@ -511,10 +525,17 @@ most ZPL operations (e.g. write, create) will return . .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int Determines the number of block allocators to use per spa instance. -Capped by the number of actual CPUs in the system. +Capped by the number of actual CPUs in the system via +.Sy spa_cpus_per_allocator . .Pp Note that setting this value too high could result in performance degradation and/or excess fragmentation. +The set value only applies to pools imported or created afterwards. +. +.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int +Determines the minimum number of CPUs required per block allocator +in a spa instance. +The set value only applies to pools imported or created afterwards. . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the @@ -550,6 +571,9 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . +.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +If set, adjacent empty ZAP blocks will be collapsed, reducing disk space. +. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to @@ -564,6 +588,13 @@ Max bytes to prefetch per stream. .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . +.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint +Requests within this byte distance from the current prefetch stream position +are considered parts of the stream, reordered due to parallel processing. +Such requests do not advance the stream position immediately unless the +.Sy zfetch_hole_shift +fill threshold is reached, but are saved to fill holes in the stream later. +. .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . @@ -1362,6 +1393,29 @@ _ 4 Driver No driver retries on driver errors. .TE . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +This parameter is ignored if +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . +. +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 +and earlier. +This "classic" method has known issues with highly fragmented IO requests and +is slower on many workloads, but it has been in use for many years and is known +to be very stable. 
+If you set this parameter, please also open a bug report describing why you +did so, including the workload involved and any error messages. +.Pp +This parameter and the classic submission method will be removed once we have +total confidence in the new method. +.Pp +This parameter only applies on Linux, and can only be set at module load time. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . @@ -2279,8 +2333,8 @@ Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. -These workers are responsible for I/O work such as compression and -checksum calculations. +These workers are responsible for I/O work such as compression, encryption, +checksum and parity calculations. Fractional number of CPUs will be rounded down. .Pp The default value of @@ -2288,33 +2342,36 @@ The default value of was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. +The set value only applies to pools imported or created afterwards. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. -Lower values improve I/O ordering and CPU utilization, -while higher reduces lock contention. +Higher values improve I/O ordering and CPU utilization, +while lower values reduce lock contention. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. +The set value only applies to pools imported or created afterwards. . -.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint -Determines the number of CPUs to run write issue taskqs. -.Pp -When 0 (the default), the value to use is computed internally -as the number of actual CPUs in the system divided by the -.Sy spa_num_allocators -value. +.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint +Determines the minimum number of threads per write issue taskq. +Higher values improve CPU utilization at high throughput, +while lower values reduce taskq lock contention at high IOPS. +The set value only applies to pools imported or created afterwards. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp Set the queue and thread configuration for the IO read queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +The set values only apply to pools imported or created afterwards. . -.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp +.It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. +The set values only apply to pools imported or created afterwards. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. @@ -2350,6 +2407,13 @@ The number of requests which can be handled concurrently is controlled by is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . +.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint +Number of zvol taskqs. +If +.Sy 0 +(the default) then scaling is done internally to prefer 6 threads per taskq. +This only applies on Linux. +. .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. 
If diff --git a/man/man7/vdevprops.7 index 3d3ebc072915..5ec37df179de 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used. The value of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev - not on the leaf vdev itself. -The default values are 10 errors in 600 seconds. +The default values for +.Sy OpenZFS on Linux +are 10 errors in 600 seconds. +For +.Sy OpenZFS on FreeBSD +defaults, see +.Xr zfsd 8 . .It Sy comment A text comment up to 8192 characters long .It Sy bootsize diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 35aa187cf063..20dbe4d0e648 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -43,7 +43,7 @@ .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Nm zfs .Cm unmount .Op Fl fu @@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted. .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Xc Mount ZFS filesystem on a path described by its .Sy mountpoint @@ -83,6 +83,8 @@ for more information. .It Fl a Mount all available ZFS file systems. Invoked automatically as part of the boot process if configured. +.It Fl R +Mount the specified filesystems along with all their children. .It Ar filesystem Mount the specified filesystem. .It Fl o Ar options diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 index c01bcc643e5d..8cc19caf3f00 100644 --- a/man/man8/zfs-set.8 +++ b/man/man8/zfs-set.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd April 20, 2024 .Dt ZFS-SET 8 .Os . @@ -158,6 +158,15 @@ A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , volume , bookmark , No or Sy all . +.Sy fs , +.Sy snap , +or +.Sy vol +can be used as aliases for +.Sy filesystem , +.Sy snapshot , +or +.Sy volume . .El .It Xo .Nm zfs diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index b692f12130a8..ad9e7a42bfac 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -19,10 +19,11 @@ .\" CDDL HEADER END .\" .\" Copyright 2013 Darik Horn . All rights reserved. +.\" Copyright (c) 2024, Klara Inc. .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd May 26, 2021 +.Dd April 4, 2024 .Dt ZINJECT 8 .Os . @@ -128,6 +129,14 @@ Force a vdev error. . .It Xo .Nm zinject +.Fl i Ar seconds +.Ar pool +.Xc +Add an artificial delay during the next import of a pool. +This injector is automatically cleared after the import is finished. +. +.It Xo +.Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool @@ -210,9 +219,11 @@ to flip a bit in the data after a read, .It Sy dtl for an ECHILD error, .It Sy io -for an EIO error where reopening the device will succeed, or +for an EIO error where reopening the device will succeed, .It Sy nxio -for an ENXIO error where reopening the device will fail. +for an ENXIO error where reopening the device will fail, or +.It Sy noop +to drop the IO without executing it, and return success. .El .Pp For EIO and ENXIO, the "failed" reads or writes still occur. @@ -257,6 +268,7 @@ Run for this many seconds before reporting failure.
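As an aside on the vdevprops change above: the checksum_n/checksum_t pair expresses an n-errors-within-t-seconds rule. Below is a small, purely illustrative sketch of such a rolling-window check; the names and the ring-buffer approach are hypothetical, and the real logic lives in ZED on Linux and zfsd(8) on FreeBSD.

#include <stdbool.h>
#include <stdint.h>

#define CKSUM_N 10      /* default threshold, per vdevprops(7) on Linux */
#define CKSUM_T 600     /* default window, in seconds */

typedef struct {
    uint64_t ev_times[CKSUM_N]; /* timestamps of the last N errors, zeroed */
    int ev_next;                /* next slot to overwrite */
} err_window_t;

/* Record one checksum error; returns true once N errors land within T. */
static bool
err_window_event(err_window_t *w, uint64_t now)
{
    uint64_t oldest = w->ev_times[w->ev_next];  /* error N events ago */

    w->ev_times[w->ev_next] = now;
    w->ev_next = (w->ev_next + 1) % CKSUM_N;

    /* Slot was in use and falls inside the window: threshold reached. */
    return (oldest != 0 && now - oldest <= CKSUM_T);
}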
.It Fl T Ar failure Set the failure type to one of .Sy all , +.Sy ioctl , .Sy claim , .Sy free , .Sy read , diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8 index 8ccdcccc7b06..60b35f1a511a 100644 --- a/man/man8/zpool-add.8 +++ b/man/man8/zpool-add.8 @@ -24,8 +24,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024 by Delphix. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd March 8, 2024 .Dt ZPOOL-ADD 8 .Os . @@ -36,6 +37,7 @@ .Nm zpool .Cm add .Op Fl fgLnP +.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns … . @@ -56,7 +58,8 @@ subcommand. .It Fl f Forces use of .Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. +even if they appear in use, have conflicting ashift values, or specify +a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display @@ -91,6 +94,17 @@ See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl -allow-ashift-mismatch +Disable the ashift validation which allows mismatched ashift values in the +pool. +Adding top-level +.Ar vdev Ns s +with different sector sizes will prohibit future device removal operations; see +.Xr zpool-remove 8 . +.It Fl -allow-in-use +Allow vdevs to be added even if they might be in use in another pool. +.It Fl -allow-replication-mismatch +Allow vdevs with conflicting replication levels to be added to the pool. .El . .Sh EXAMPLES diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index c61ecae483ac..3e448be87fc2 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. .Bl -tag -width Ds .It Fl -power Power on the device's slot in the storage enclosure and wait for the device diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index e1436f6ded57..ef20ef4e003c 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara Inc. .\" -.Dd July 11, 2023 +.Dd February 28, 2024 .Dt ZPOOL-EVENTS 8 .Os . @@ -363,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data. .Sh I/O STAGES The ZFS I/O pipeline is comprised of various stages which are defined below. The individual stages are used to construct these basic I/O -operations: Read, Write, Free, Claim, and Ioctl. +operations: Read, Write, Free, Claim, Flush, and Trim. These stages may be set on an event to describe the life cycle of a given I/O request. .Pp @@ -372,43 +373,43 @@ tab(:); l l l .
Stage:Bit Mask:Operations _:_:_ -ZIO_STAGE_OPEN:0x00000001:RWFCI +ZIO_STAGE_OPEN:0x00000001:RWFCXT -ZIO_STAGE_READ_BP_INIT:0x00000002:R---- -ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- -ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- -ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- -ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- +ZIO_STAGE_READ_BP_INIT:0x00000002:R----- +ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W---- +ZIO_STAGE_FREE_BP_INIT:0x00000008:--F--- +ZIO_STAGE_ISSUE_ASYNC:0x00000010:-WF--T +ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W---- -ZIO_STAGE_ENCRYPT:0x00000040:-W--- -ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- +ZIO_STAGE_ENCRYPT:0x00000040:-W---- +ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---- -ZIO_STAGE_NOP_WRITE:0x00000100:-W--- +ZIO_STAGE_NOP_WRITE:0x00000100:-W---- -ZIO_STAGE_BRT_FREE:0x00000200:--F-- +ZIO_STAGE_BRT_FREE:0x00000200:--F--- -ZIO_STAGE_DDT_READ_START:0x00000400:R---- -ZIO_STAGE_DDT_READ_DONE:0x00000800:R---- -ZIO_STAGE_DDT_WRITE:0x00001000:-W--- -ZIO_STAGE_DDT_FREE:0x00002000:--F-- +ZIO_STAGE_DDT_READ_START:0x00000400:R----- +ZIO_STAGE_DDT_READ_DONE:0x00000800:R----- +ZIO_STAGE_DDT_WRITE:0x00001000:-W---- +ZIO_STAGE_DDT_FREE:0x00002000:--F--- -ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC- -ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC- +ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-- +ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-- -ZIO_STAGE_DVA_THROTTLE:0x00010000:-W--- -ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W--- -ZIO_STAGE_DVA_FREE:0x00040000:--F-- -ZIO_STAGE_DVA_CLAIM:0x00080000:---C- +ZIO_STAGE_DVA_THROTTLE:0x00010000:-W---- +ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W---- +ZIO_STAGE_DVA_FREE:0x00040000:--F--- +ZIO_STAGE_DVA_CLAIM:0x00080000:---C-- -ZIO_STAGE_READY:0x00100000:RWFCI +ZIO_STAGE_READY:0x00100000:RWFCXT -ZIO_STAGE_VDEV_IO_START:0x00200000:RW--I -ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--I -ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--I +ZIO_STAGE_VDEV_IO_START:0x00200000:RW--XT +ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT +ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT -ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R---- +ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- -ZIO_STAGE_DONE:0x02000000:RWFCI +ZIO_STAGE_DONE:0x02000000:RWFCXT .TE . .Sh I/O FLAGS diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 24ad6e643cae..bbe7a45aa0c6 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DeigLpPstvx +.Op Fl DegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,14 +69,20 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl D +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). -.It Fl i -Display vdev initialization status. .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. +.It Fl i +Display vdev initialization status. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the @@ -90,12 +96,6 @@ the path. This can be used in conjunction with the .Fl L flag. -.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count.
.It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in diff --git a/module/Makefile.bsd b/module/Makefile.bsd index e9ad69fc50a2..d9d31564d090 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -82,12 +82,9 @@ CFLAGS+= -DBITS_PER_LONG=64 SRCS= vnode_if.h device_if.h bus_if.h -# avl +#avl SRCS+= avl.c -# icp -SRCS+= edonr.c - #icp/algs/blake3 SRCS+= blake3.c \ blake3_generic.c \ @@ -107,9 +104,12 @@ SRCS+= blake3_avx2.S \ blake3_sse2.S \ blake3_sse41.S +#icp/algs/edonr +SRCS+= edonr.c + #icp/algs/sha2 -SRCS+= sha2_generic.c \ - sha256_impl.c \ +SRCS+= sha256_impl.c \ + sha2_generic.c \ sha512_impl.c #icp/asm-arm/sha2 @@ -122,8 +122,8 @@ SRCS+= sha256-armv8.S \ #icp/asm-ppc64/sha2 SRCS+= sha256-p8.S \ - sha512-p8.S \ sha256-ppc.S \ + sha512-p8.S \ sha512-ppc.S #icp/asm-x86_64/sha2 @@ -157,10 +157,10 @@ SRCS+= lapi.c \ lzio.c #nvpair -SRCS+= nvpair.c \ - fnvpair.c \ - nvpair_alloc_spl.c \ - nvpair_alloc_fixed.c +SRCS+= fnvpair.c \ + nvpair.c \ + nvpair_alloc_fixed.c \ + nvpair_alloc_spl.c #os/freebsd/spl SRCS+= acl_common.c \ @@ -184,7 +184,6 @@ SRCS+= acl_common.c \ spl_zlib.c \ spl_zone.c - .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "powerpcspe" || ${MACHINE_ARCH} == "arm" SRCS+= spl_atomic.c @@ -207,6 +206,7 @@ SRCS+= abd_os.c \ zfs_ctldir.c \ zfs_debug.c \ zfs_dir.c \ + zfs_file_os.c \ zfs_ioctl_compat.c \ zfs_ioctl_os.c \ zfs_racct.c \ @@ -217,19 +217,20 @@ SRCS+= abd_os.c \ zvol_os.c #unicode -SRCS+= uconv.c \ - u8_textprep.c +SRCS+= u8_textprep.c \ + uconv.c #zcommon -SRCS+= zfeature_common.c \ +SRCS+= cityhash.c \ + zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ - zfs_fletcher.c \ zfs_fletcher_avx512.c \ + zfs_fletcher.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ - zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ + zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ @@ -243,14 +244,13 @@ SRCS+= abd.c \ blkptr.c \ bplist.c \ bpobj.c \ + bptree.c \ + bqueue.c \ brt.c \ btree.c \ - cityhash.c \ + dataset_kstats.c \ dbuf.c \ dbuf_stats.c \ - bptree.c \ - bqueue.c \ - dataset_kstats.c \ ddt.c \ ddt_stats.c \ ddt_zap.c \ @@ -266,13 +266,13 @@ SRCS+= abd.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ + dsl_bookmark.c \ + dsl_crypt.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ - dsl_bookmark.c \ - dsl_dir.c \ - dsl_crypt.c \ dsl_destroy.c \ + dsl_dir.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ @@ -281,9 +281,9 @@ SRCS+= abd.c \ edonr_zfs.c \ fm.c \ gzip.c \ - lzjb.c \ lz4.c \ lz4_zfs.c \ + lzjb.c \ metaslab.c \ mmp.c \ multilist.c \ @@ -296,6 +296,8 @@ SRCS+= abd.c \ sha2_zfs.c \ skein_zfs.c \ spa.c \ + space_map.c \ + space_reftree.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ @@ -303,16 +305,14 @@ SRCS+= abd.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ - space_map.c \ - space_reftree.c \ txg.c \ uberblock.c \ unique.c \ vdev.c \ vdev_draid.c \ vdev_draid_rand.c \ - vdev_indirect.c \ vdev_indirect_births.c \ + vdev_indirect.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ @@ -320,11 +320,11 @@ SRCS+= abd.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ - vdev_raidz_math.c \ - vdev_raidz_math_scalar.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_rebuild.c \ @@ -343,7 +343,6 @@ SRCS+= abd.c \ zfeature.c \ zfs_byteswap.c \ zfs_chksum.c \ - zfs_file_os.c \ 
zfs_fm.c \ zfs_fuid.c \ zfs_impl.c \ @@ -367,30 +366,36 @@ SRCS+= abd.c \ zvol.c #zstd -SRCS+= zfs_zstd.c \ - entropy_common.c \ +SRCS+= zfs_zstd.c + +#zstd/common +SRCS+= entropy_common.c \ error_private.c \ - fse_compress.c \ fse_decompress.c \ - hist.c \ - huf_compress.c \ - huf_decompress.c \ pool.c \ xxhash.c \ zstd_common.c \ + +#zstd/compress +SRCS+= fse_compress.c \ + hist.c \ + huf_compress.c \ zstd_compress.c \ zstd_compress_literals.c \ zstd_compress_sequences.c \ zstd_compress_superblock.c \ - zstd_ddict.c \ - zstd_decompress.c \ - zstd_decompress_block.c \ zstd_double_fast.c \ zstd_fast.c \ zstd_lazy.c \ zstd_ldm.c \ zstd_opt.c +#zstd/decompress +SRCS+= huf_decompress.c \ + zstd_ddict.c \ + zstd_decompress_block.c \ + zstd_decompress.c + beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index dc2719d142db..fefebf08116e 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if !defined(LD_VERSION) || LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -555,7 +563,7 @@ compress_pre: zfs_blake3_compress_xof_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2: zfs_blake3_hash_many_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE stp d15, d14, [sp, #-160]! 
stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index c4c2dfc5bcde..1ad6cefc6d06 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if !defined(LD_VERSION) || LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -565,7 +573,7 @@ compress_pre: zfs_blake3_compress_xof_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index 7ae486e4e229..4dcdd3b65d0b 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 9c61eeee4d7b..f6c8f7742912 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index d9449e47e87a..887f7d32df4a 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 58a37df62b69..3b812271f98b 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index a65dfec86caf..869093afa3ed 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -247,7 +247,7 @@ vdev_file_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); @@ -255,14 +255,7 @@ vdev_file_io_start(zio_t *zio) return; } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - zio->io_error = zfs_file_fsync(vf->vf_file, - O_SYNC|O_DSYNC); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); zio_execute(zio); return; diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 2df13e7e187e..38c1d8e9e464 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ 
-1053,7 +1053,7 @@ vdev_geom_io_intr(struct bio *bp) /* * We have to split bio freeing into two parts, because the ABD code * cannot be called in this context and vdev_op_io_done is not called - * for ZIO_TYPE_IOCTL zio-s. + * for ZIO_TYPE_FLUSH zio-s. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { g_destroy_bio(bp); @@ -1153,46 +1153,35 @@ vdev_geom_io_start(zio_t *zio) vd = zio->io_vd; - switch (zio->io_type) { - case ZIO_TYPE_IOCTL: + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; - } else { - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || - vdev_geom_bio_flush_disable) - break; - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - goto sendreq; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } } - zio_execute(zio); - return; - case ZIO_TYPE_TRIM: - if (!vdev_geom_bio_delete_disable) { - goto sendreq; + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) { + zio_execute(zio); + return; + } + + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } + } else if (zio->io_type == ZIO_TYPE_TRIM) { + if (vdev_geom_bio_delete_disable) { + zio_execute(zio); + return; } - zio_execute(zio); - return; - default: - ; - /* PASSTHROUGH --- placate compiler */ } -sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM || - zio->io_type == ZIO_TYPE_IOCTL); + zio->io_type == ZIO_TYPE_FLUSH); cp = vd->vdev_tsd; if (cp == NULL) { @@ -1244,7 +1233,7 @@ vdev_geom_io_start(zio_t *zio) bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; - case ZIO_TYPE_IOCTL: + case ZIO_TYPE_FLUSH: bp->bio_cmd = BIO_FLUSH; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 6a7c2d2811b1..712ff1b837d7 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1259,7 +1259,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* Move to a new hashtable entry. */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index c384b7b378c3..e7b812c3b5b5 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -158,7 +158,7 @@ task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) * throttling the task dispatch rate. */ spin_unlock_irqrestore(&tq->tq_lock, *irqflags); - schedule_timeout(HZ / 100); + schedule_timeout_interruptible(HZ / 100); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (count < 100) { diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c index 6b77524181db..e1773da5d173 100644 --- a/module/os/linux/spl/spl-xdr.c +++ b/module/os/linux/spl/spl-xdr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 24390fbbf125..cee7410c8833 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ /* @@ -59,7 +60,9 @@ #include #ifdef _KERNEL #include +#include #include +#include #endif #ifdef _KERNEL @@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that the last chunk is not in use. abd_iterate_*() must clear + * this state (directly or via abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1009,6 +1015,134 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) + +/* + * This is abd_iter_page(), the function underneath abd_iterate_page_func(). + * It yields the next page struct and data offset and size within it, without + * mapping it into the address space. + */ + +/* + * "Compound pages" are a group of pages that can be referenced from a single + * struct page *. It's organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we + * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a + * great many of the IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGESIZE pages, and can be safely used + * as-is. However, the head page has length covering itself and all the tail + * pages. If the ABD chunk spans multiple pages, then we can use the head page + * and a >PAGESIZE length, which is far more efficient. + * + * Before kernel 4.5 however, compound page heads were refcounted separately + * from tail pages, such that moving back to the head page would require us to + * take a reference to it and release it once we're completely finished with + * it. In practice, that means when our caller is done with the ABD, which we + * have no insight into from here. Rather than contort this API to track head + * page references on such ancient kernels, we disable this special compound + * page handling on kernels before 4.5, instead just treating each page within + * it as a regular PAGESIZE page (which it is). This is slightly less + * efficient, but makes everything far simpler. + * + * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the + * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to + * understand compound pages, or not, as required. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define ABD_ITER_COMPOUND_PAGES 1 +#define ABD_ITER_PAGE_SIZE(page) \ + (PageCompound(page) ?
page_size(page) : PAGESIZE) +#else +#undef ABD_ITER_COMPOUND_PAGES +#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) +#endif + +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + /* + * Find the page, and the start of the data within it. This is computed + * differently for linear and scatter ABDs; linear is referenced by + * virtual memory location, while scatter is referenced by page + * pointer. + */ + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = nth_page(sg_page(aiter->iter_sg), + aiter->iter_offset >> PAGE_SHIFT); + + /* position within page */ + doff = aiter->iter_offset & (PAGESIZE - 1); + } + +#ifdef ABD_ITER_COMPOUND_PAGES + if (PageTail(page)) { + /* + * If this is a compound tail page, move back to the head, and + * adjust the offset to match. This may let us yield a much + * larger amount of data from a single logical page, and so + * leave our caller with fewer pages to process. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } +#endif + + ASSERT(page); + + /* + * Compute the maximum amount of data we can take from this page. This + * is the smaller of: + * - the remaining space in the page + * - the remaining space in this scatterlist entry (which may not cover + * the entire page) + * - the remaining space in the abd (which may not cover the entire + * scatterlist entry) + */ + dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, + aiter->iter_abd->abd_size - aiter->iter_pos); + if (!abd_is_linear(aiter->iter_abd)) + dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); + ASSERT3U(dsize, >, 0); + + /* final iterator outputs */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + aiter->iter_page_dsize = dsize; +} + +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd @@ -1163,4 +1297,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index b0bda5fa2012..7284b922b3bf 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -44,15 +45,25 @@ /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to - * just use the handle as a proxy. For pre-6.8, we just emulate this with - * a cast, since we don't need any of the other fields inside the handle. + * just use the handle as a proxy. + * + * Linux 6.9.x uses a file for the same purpose. 
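+ *
+ * (In the 6.9 case, file_bdev() recovers the underlying block_device from
+ * the struct file, so the BDH_BDEV() macro below stays a one-liner there
+ * as well.)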
+ * + * For pre-6.8, we just emulate this with a cast, since we don't need any of + * the other fields inside the handle. */ -#ifdef HAVE_BDEV_OPEN_BY_PATH +#if defined(HAVE_BDEV_OPEN_BY_PATH) typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) +#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +typedef struct file zfs_bdev_handle_t; +#define BDH_BDEV(bdh) (file_bdev(bdh)) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) @@ -66,6 +77,13 @@ typedef struct vdev_disk { krwlock_t vd_lock; } vdev_disk_t; +/* + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be + * clamped. Setting it to zero will cause the kernel's ideal size to be used. + */ +uint_t zfs_vdev_disk_max_segs = 0; + /* * Unique identifier for the exclusive vdev holder. */ @@ -83,55 +101,47 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. - */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[]; /* Attached bio's */ -} dio_request_t; - /* * BIO request failfast mask. */ static unsigned int zfs_vdev_failfast_mask = 1; +/* + * Convert SPA mode flags into bdev open mode flags. + */ #ifdef HAVE_BLK_MODE_T -static blk_mode_t +typedef blk_mode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ BLK_OPEN_READ +#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE +#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL +#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) #else -static fmode_t +typedef fmode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ FMODE_READ +#define VDEV_BDEV_MODE_WRITE FMODE_WRITE +#define VDEV_BDEV_MODE_EXCL FMODE_EXCL +#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) #endif -vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) -{ -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = 0; - - if (spa_mode & SPA_MODE_READ) - mode |= BLK_OPEN_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= BLK_OPEN_WRITE; +static vdev_bdev_mode_t +vdev_bdev_mode(spa_mode_t smode) +{ + ASSERT3U(smode, !=, SPA_MODE_UNINIT); + ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); - if (exclusive) - mode |= BLK_OPEN_EXCL; -#else - fmode_t mode = 0; + vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; - if (spa_mode & SPA_MODE_READ) - mode |= FMODE_READ; + if (smode & SPA_MODE_READ) + bmode |= VDEV_BDEV_MODE_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= FMODE_WRITE; + if (smode & SPA_MODE_WRITE) + bmode |= VDEV_BDEV_MODE_WRITE; - if (exclusive) - mode |= FMODE_EXCL; -#endif + ASSERT(bmode & VDEV_BDEV_MODE_MASK); + ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); - return (mode); + return (bmode); } /* @@ -238,30 +248,32 @@ vdev_disk_kobj_evt_post(vdev_t *v) } static zfs_bdev_handle_t * -vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) +vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { -#if defined(HAVE_BDEV_OPEN_BY_PATH) - return (bdev_open_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); + +#if 
defined(HAVE_BDEV_FILE_OPEN_BY_PATH) + return (bdev_file_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BDEV_OPEN_BY_PATH) + return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (blkdev_get_by_path(path, bmode, holder, NULL)); #else - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder)); + return (blkdev_get_by_path(path, bmode, holder)); #endif } static void -vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); +#elif defined(HAVE_BLKDEV_PUT) + return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); #else - return (blkdev_put(BDH_BDEV(bdh), - vdev_bdev_mode(mode, B_TRUE))); + fput(bdh); #endif } @@ -270,11 +282,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#endif + spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -325,16 +333,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { - bdh = vdev_blkdev_get_by_path(disk_name, mode, + bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -379,7 +387,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { - bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, + bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* @@ -389,7 +397,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (v->vdev_removed) break; - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible(MSEC_TO_TICK(10)); } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; @@ -421,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(bdev); - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - v->vdev_nowritecache = B_FALSE; + /* + * If the device has a write cache, clear the nowritecache flag, + * so that we start issuing flush requests again. + */ + v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); /* Set when device reports it supports TRIM. 
*/ v->vdev_has_trim = bdev_discard_supported(bdev); @@ -457,95 +468,15 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdh != NULL) { + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static void -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -697,8 +628,467 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? + MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. 
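+ *
+ * (The BIOs in a vbio are linked together with bio_chain() as they fill,
+ * so the kernel completes them as a unit and the completion callback runs
+ * only once, on the final BIO of the chain.)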
+ */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + int vbio_flags; /* bio flags */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + vbio->vbio_abd = NULL; + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); + vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; + + return (vbio); +} + +BIO_END_IO_PROTO(vbio_completion, bio, error); + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio = vbio->vbio_bio; + uint_t ssize; + + while (size > 0) { + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + VERIFY(bio); + + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); + + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + vbio->vbio_lbs_mask); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. */ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + bio = NULL; + } + + return (0); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +/* Create some BIOs, fill them with data and submit them */ +static void +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) +{ + /* + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. + */ + struct blk_plug plug; + blk_start_plug(&plug); + + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); + + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; + + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. 
The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ + vdev_submit_bio(vbio->vbio_bio); + + blk_finish_plug(&plug); +} + +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + zio_t *zio = vbio->vbio_zio; + + ASSERT(zio); + + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); + + if (zio->io_error) + vdev_disk_error(zio); + + /* Return the BIO to the kernel */ + bio_put(bio); + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ABD, with changes if appropriate. + */ + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } + + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. + * + * NOTE: if you change this function, change the copy in + * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test + * data there to validate the change you're making. + * + */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + (void) page; + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = (off+len) & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * B_TRUE if all pages can be submitted like this, B_FALSE otherwise. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + int flags = 0; + + /* + * Accessing outside the block device is never allowed.
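+ * For example, on a 1 GiB device a 16 KiB read starting 8 KiB before the
+ * end must fail: it is rejected with EIO below, before any BIO is built.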
+ */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + v->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). + */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something is seriously wrong and it's not safe to continue. + */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + vbio_t *vbio = vbio_alloc(zio, bdev, flags); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); + return (0); +} + +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks.
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); @@ -711,9 +1101,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -736,7 +1133,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { @@ -771,23 +1168,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -801,7 +1198,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -815,11 +1212,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - vdev_disk_dio_put(dr); + vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -862,8 +1261,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) { zio_t *zio = bio->bi_private; @@ -878,62 +1275,109 @@ BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) zio_interrupt(zio); } +/* + * Wrappers for the different secure erase and discard APIs. We use async + * when available; in this case, *biop is set to the last bio in the chain. 
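+ * If only a synchronous variant is available, *biop is left NULL and the
+ * return code alone says whether the operation already completed.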
+ */ static int -vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) { - int ret; - struct bio *bio = NULL; + *biop = NULL; + int error; -#if defined(BLKDEV_DISCARD_SECURE) - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + error = blkdev_issue_secure_erase(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); #else - (void) flags; - ret = - __blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#error "unsupported kernel" #endif - if (!ret && bio) { - bio->bi_private = zio; - bio->bi_end_io = vdev_disk_discard_end_io; - vdev_submit_bio(bio); - } - return (ret); + + return (error); } + +static int +vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) +{ + *biop = NULL; + int error; + +#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#else +#error "unsupported kernel" #endif + return (error); +} + +/* + * Entry point for TRIM ops. This calls the right wrapper for secure erase or + * discard, and then does the appropriate finishing work for error vs success + * and async vs sync. + */ static int vdev_disk_io_trim(zio_t *zio) { - unsigned long trim_flags = 0; - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - return (-blkdev_issue_secure_erase( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); -#elif defined(BLKDEV_DISCARD_SECURE) - trim_flags |= BLKDEV_DISCARD_SECURE; -#endif + int error; + struct bio *bio; + + zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; + sector_t sector = zio->io_offset >> 9; + sector_t nsects = zio->io_size >> 9; + + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); + else + error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); + + if (error != 0) + return (SET_ERROR(-error)); + + if (bio == NULL) { + /* + * This was a synchronous op that completed successfully, so + * return it to ZFS immediately. + */ + zio_interrupt(zio); + } else { + /* + * This was an asynchronous op; set up completion callback and + * issue it. 
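+ * When it completes, vdev_disk_discard_end_io() translates the BIO
+ * result into zio->io_error and hands the zio back via zio_interrupt().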
+ */ + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); } -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ - defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) - return (vdev_issue_discard_trim(zio, trim_flags)); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - return (-blkdev_issue_discard( - BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); -#else -#error "Unsupported kernel" -#endif + + return (0); } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. @@ -959,79 +1403,72 @@ vdev_disk_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: + case ZIO_TYPE_FLUSH: if (!vdev_readable(v)) { - rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (v->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - + /* Drive not there, can't flush */ + error = SET_ERROR(ENXIO); + } else if (zfs_nocacheflush) { + /* Flushing disabled by operator, declare success */ + error = 0; + } else if (v->vdev_nowritecache) { + /* This vdev not capable of flushing */ + error = SET_ERROR(ENOTSUP); + } else { + /* + * Issue the flush. If successful, the response will + * be handled in the completion callback, so we're done. + */ error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); } + /* Couldn't issue the flush, so set the error and return it */ rw_exit(&vd->vd_lock); + zio->io_error = error; zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: - zio->io_error = vdev_disk_io_trim(zio); + error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (error) { + zio->io_error = error; + zio_execute(zio); + } + return; + + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_disk_io_rw_fn(zio); + rw_exit(&vd->vd_lock); + if (error) { + zio->io_error = error; zio_interrupt(zio); -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - zio_interrupt(zio); -#endif + } return; default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); + rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); - - if (error) { - zio->io_error = error; - zio_interrupt(zio); - return; - } + __builtin_unreachable(); } static void @@ -1080,8 +1517,49 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. 
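The vdev_disk_io_rw_fn pointer introduced above makes READ/WRITE submission pluggable: it is resolved once from zfs_vdev_disk_classic, either by the module-parameter setter at load time or lazily at first vdev init. A hedged sketch of that selection in one place; example_select_rw is hypothetical, the two candidate functions are the ones this patch wires in:

/* Sketch only: one-time selection of the BIO submission method. */
static int
example_select_rw(zio_t *zio)
{
	if (vdev_disk_io_rw_fn == NULL)
		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
		    vdev_classic_physio : vdev_disk_io_rw;

	return (vdev_disk_io_rw_fn(zio));
}

Loading the module with zfs_vdev_disk_classic=1 would therefore route all reads and writes through the classic path; since the parameter is registered ZMOD_RD, it appears intended to be chosen at load time rather than flipped while I/O is in flight.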
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + +/* + * At first vdev use, set the submission function from the default value if + * it hasn't been set already. + */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, @@ -1174,3 +1652,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 5abc0426d1a7..ac41a2615f16 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); @@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio) return; } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - /* - * We cannot safely call vfs_fsync() when PF_FSTRANS - * is set in the current context. Filesystems like - * XFS include sanity checks to verify it is not - * already set, see xfs_vm_writepage(). Therefore - * the sync must be dispatched to a different context. - */ - if (__spl_pf_fstrans_check()) { - VERIFY3U(taskq_dispatch(vdev_file_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, - TASKQID_INVALID); - return; - } - - zio->io_error = zfs_file_fsync(vf->vf_file, - O_SYNC | O_DSYNC); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); + if (zfs_nocacheflush) { + zio_execute(zio); + return; } + /* + * We cannot safely call vfs_fsync() when PF_FSTRANS + * is set in the current context. Filesystems like + * XFS include sanity checks to verify it is not + * already set, see xfs_vm_writepage(). Therefore + * the sync must be dispatched to a different context.
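The PF_FSTRANS restriction described here is handled with a re-dispatch pattern that is easy to get wrong, so a hedged sketch may help. example_flush and example_fsync_task are hypothetical stand-ins; the taskq, check, and fsync calls are the ones this hunk preserves:

/* Sketch only: bounce work to a taskq when the context forbids it. */
static void
example_fsync_task(void *arg)
{
	zio_t *zio = arg;
	vdev_file_t *vf = zio->io_vd->vdev_tsd;

	zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
	zio_execute(zio);
}

static void
example_flush(zio_t *zio)
{
	if (__spl_pf_fstrans_check()) {
		/* In fs-transaction context: hand off to a clean thread. */
		VERIFY3U(taskq_dispatch(vdev_file_taskq, example_fsync_task,
		    zio, TQ_SLEEP), !=, TASKQID_INVALID);
		return;
	}
	example_fsync_task(zio);	/* safe to fsync inline */
}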
+ */ + if (__spl_pf_fstrans_check()) { + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); + zio_execute(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a32307c39331..1cecad9f7755 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3795,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_NOWAIT); + err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 3caa0fc6c214..9dec52215c7c 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; + int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); + ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); - return (0); + return (ret); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { - (void) zpl_putpage(&pp->page, wbc, data); - return (0); + return (zpl_putpage(&pp->page, wbc, data)); } #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 3065d54fa9da..64728fdb1187 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -26,6 +26,9 @@ #include #endif #include +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE +#include +#endif #include #include #include @@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); -#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) /* * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. @@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) + /* + * Since 6.8 the fallback function is called splice_copy_file_range + * and has a slightly different signature. 
+ */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); #else /* * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal @@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, */ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; -#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ return (ret); } diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index ea6a2b733f82..3e020e532263 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif @@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE; static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif +static unsigned int zvol_num_taskqs = 0; + #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -114,7 +123,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -static taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -532,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) +#ifdef HAVE_BLK_MQ_RQ_HCTX + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#else + blk_mq_hw_queue = + rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; +#endif +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -601,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -609,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -631,7 +660,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -769,7 +798,8 @@ zvol_open(struct block_device *bdev, fmode_t flag) if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); + schedule_timeout_interruptible( + MSEC_TO_TICK(10)); 
goto retry; #endif } else { @@ -1053,6 +1083,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) if (zso->zvo_disk == NULL) return (1); + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + return (1); + } + + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else @@ -1103,6 +1143,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv) } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { @@ -1256,7 +1307,7 @@ zvol_os_free(zvol_state_t *zv) del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - defined(HAVE_BLK_ALLOC_DISK) + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else @@ -1314,6 +1365,13 @@ zvol_os_create_minor(const char *name) if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; + if (MINOR(minor) != minor) { + /* too many partitions can cause an overflow */ + zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", + name, minor, MINOR(minor)); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EINVAL)); + } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { @@ -1514,7 +1572,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); @@ -1570,8 +1628,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use at least 32 zvol_threads, but on many-core systems + * prefer 6 threads per taskq, with no more taskqs + * than threads in them on large systems.
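The sizing table in the comment that follows can be verified directly. A hedged, standalone reproduction of the heuristic, assuming the resolved thread count is MAX(32, ncpus), which is what the table rows imply:

/* Sketch only: reproduce the taskq sizing table. */
#include <stdio.h>

int
main(void)
{
	unsigned cpus_tab[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256 };

	for (unsigned i = 0; i < sizeof (cpus_tab) / sizeof (*cpus_tab); i++) {
		unsigned cpus = cpus_tab[i];
		unsigned threads = cpus > 32 ? cpus : 32;
		unsigned tqs = 1 + cpus / 6;

		while (tqs * tqs > threads)
			tqs--;		/* never more taskqs than threads each */

		unsigned per_tq = threads / tqs;
		if (per_tq * tqs < threads)
			per_tq++;	/* round up, as zvol_init() does */

		printf("%4u cpus -> %2u taskqs x %2u threads = %3u total\n",
		    cpus, tqs, per_tq, tqs * per_tq);
	}
	return (0);
}

At dispatch time the patch then selects a queue with taskq_hash % tqs_cnt, where the hash mixes the zvol pointer, the 512 MB offset bucket (offset >> ZVOL_TASKQ_OFFSET_SHIFT), and the blk-mq hardware queue, so streams to different regions spread across taskqs while sequential I/O tends to stay on one.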
+ * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1591,11 +1681,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1606,9 +1707,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1629,6 +1743,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 0a2411a2d572..2c0cda25dbc6 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); 
+ + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3bcffb3c7ede..51039af9bcc0 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1014,7 +1014,7 @@ static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t birth = BP_GET_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; @@ -1960,7 +1960,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf) ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); @@ -2083,7 +2083,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * allocate a new data buffer for the buf. */ if (ARC_BUF_SHARED(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); + ASSERTF(ARC_BUF_COMPRESSED(buf), + "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; @@ -2183,7 +2184,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); - spa_log_error(spa, zb, &buf->b_hdr->b_birth); + spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } @@ -3872,7 +3873,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3984,6 +3985,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -3994,16 +4015,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). 
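Since markers are full arc_buf_hdr_t allocations that live on the same sublists as real headers, every walker has to recognize and step over them. A hedged sketch; example_count_real is hypothetical, and the b_spa == 0 test is the sentinel described above:

/* Sketch only: skip marker headers while walking a sublist. */
static uint64_t
example_count_real(multilist_sublist_t *mls)
{
	uint64_t n = 0;

	for (arc_buf_hdr_t *hdr = multilist_sublist_head(mls); hdr != NULL;
	    hdr = multilist_sublist_next(mls, hdr)) {
		if (hdr->b_spa == 0)
			continue;	/* marker, not a real buffer */
		n++;
	}
	return (n);
}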
- */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4011,7 +4024,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4055,7 +4068,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4120,7 +4133,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -5251,7 +5264,7 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, @@ -5354,7 +5367,7 @@ arc_read_done(zio_t *zio) error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -5639,7 +5652,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, zb, &hdr->b_birth); + spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); @@ -5686,12 +5699,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * embedded data. */ arc_buf_hdr_t *exists = NULL; - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_birth = BP_GET_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -6557,7 +6570,7 @@ arc_write_done(zio_t *zio) buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_birth = BP_GET_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); @@ -8633,7 +8646,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -8889,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; - void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); @@ -8910,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. 
*/ - if (HDR_HAS_RABD(hdr) && asize != psize) { - ASSERT3U(asize, >=, psize); + if (HDR_HAS_RABD(hdr)) { + ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); + abd_zero_off(to_write, psize, asize - psize); goto out; } @@ -8924,48 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) + if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - /* - * In some cases, we can wind up with size > asize, so - * we need to opt for the larger allocation option here. - * - * (We also need abd_return_buf_copy in all cases because - * it's an ASSERT() to modify the buffer before returning it - * with arc_return_buf(), and all the compressors - * write things before deciding to fail compression in nearly - * every case.) - */ - uint64_t bufsize = MAX(size, asize); - cabd = abd_alloc_for_io(bufsize, ismd); - tmp = abd_borrow_buf(cabd, bufsize); - - psize = zio_compress_data(compress, to_write, &tmp, size, - hdr->b_complevel); - - if (psize >= asize) { - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); - goto encrypt; + size_t bufsize = MAX(size, asize); + void *buf = zio_buf_alloc(bufsize); + uint64_t csize = zio_compress_data(compress, to_write, &buf, + size, hdr->b_complevel); + if (csize > psize) { + /* + * We can't re-compress the block into the original + * psize. Even if it fits into asize, it does not + * matter, since checksum will never match on read. + */ + zio_buf_free(buf, bufsize); + return (SET_ERROR(EIO)); } - ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); - if (psize < asize) - memset((char *)tmp + psize, 0, bufsize - psize); - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, bufsize); - to_write = cabd; + if (asize > csize) + memset((char *)buf + csize, 0, asize - csize); + to_write = cabd = abd_get_from_buf(buf, bufsize); + abd_take_ownership_of_buf(cabd, B_TRUE); } -encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); @@ -9046,9 +9040,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9057,10 +9051,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. 
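The recompression step above is subtle enough to warrant a distilled sketch: the block must recompress into no more than the psize recorded in the header, because the read path checksums exactly psize bytes; if the compressor produces more, the L2ARC write is abandoned with EIO rather than padded, and otherwise the buffer is zero-padded out to asize. example_recompress is hypothetical; the calls mirror the hunk above:

/* Sketch only: recompress, reject if larger than psize, pad to asize. */
static int
example_recompress(enum zio_compress c, abd_t *src, uint64_t size,
    uint64_t psize, uint64_t asize, uint8_t complevel, abd_t **out)
{
	size_t bufsize = MAX(size, asize);
	void *buf = zio_buf_alloc(bufsize);
	uint64_t csize = zio_compress_data(c, src, &buf, size, complevel);

	if (csize > psize) {
		/* The read-side checksum could never match; give up. */
		zio_buf_free(buf, bufsize);
		return (SET_ERROR(EIO));
	}
	if (asize > csize)
		memset((char *)buf + csize, 0, asize - csize);

	*out = abd_get_from_buf(buf, bufsize);
	abd_take_ownership_of_buf(*out, B_TRUE);
	return (0);
}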
@@ -9075,40 +9070,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; /* - * L2ARC fast warmup. - * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - if (arc_warm == B_FALSE) + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. */ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9123,11 +9112,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9149,12 +9137,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. */ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9185,32 +9179,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. 
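The marker dance introduced in this hunk generalizes to any multilist walk that must sleep: park a marker beside the current header, drop the sublist lock while processing, then re-lock and resume from the marker. A hedged sketch of the pattern; example_walk and process are hypothetical, the multilist calls are the ones used above, and the sublist is assumed locked on entry and on return:

/* Sketch only: marker-based traversal that never sleeps under the lock. */
static void
example_walk(multilist_sublist_t *mls, arc_buf_hdr_t *marker,
    boolean_t from_head, void (*process)(arc_buf_hdr_t *))
{
	arc_buf_hdr_t *hdr = from_head ?
	    multilist_sublist_head(mls) : multilist_sublist_tail(mls);

	while (hdr != NULL) {
		if (from_head)
			multilist_sublist_insert_after(mls, hdr, marker);
		else
			multilist_sublist_insert_before(mls, hdr, marker);
		multilist_sublist_unlock(mls);

		process(hdr);		/* may block; no sublist lock held */

		multilist_sublist_lock(mls);
		hdr = from_head ? multilist_sublist_next(mls, marker) :
		    multilist_sublist_prev(mls, marker);
		multilist_sublist_remove(mls, marker);
	}
}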
*/ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). - */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9218,54 +9225,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9274,9 +9261,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? 
*/ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10604,7 +10593,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index e772caead29b..96e1601c4e9c 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -893,7 +893,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, */ memset(&stored_bp, 0, sizeof (stored_bp)); stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; + BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp)); } else if (!BP_GET_DEDUP(bp)) { /* The bpobj will compress better without the checksum */ memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); @@ -953,7 +953,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg && + BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else @@ -985,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) /* * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg + * mintxg < logical birth <= maxtxg */ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 225ddaca1e54..ea8c0735c4b7 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ -int zfs_brt_prefetch = 1; +static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ @@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1; #define BRT_DEBUG(...) 
do { } while (0) #endif -int brt_zap_leaf_blockshift = 12; -int brt_zap_indirect_blockshift = 12; +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; @@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, - brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, - 0, tx); + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -901,7 +900,6 @@ static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; - uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); @@ -919,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) brt_unlock(brt); - error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, &one, &physsize); - if (error == 0) { - ASSERT3U(one, ==, 1); - ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, - &bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount); - BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " - "count=%llu error=%d", (u_longlong_t)mos_entries, - (u_longlong_t)brtvd->bv_vdevid, - (u_longlong_t)bre->bre_offset, - error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); - } + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); brt_wlock(brt); @@ -955,52 +940,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); - - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. 
It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1405,7 +1348,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp) ASSERT(bp != NULL); - if (!zfs_brt_prefetch) + if (!brt_zap_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); @@ -1420,13 +1363,13 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); } } @@ -1471,10 +1414,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); - } - /* Prefetch BRT entry, as we will need it in the syncing context. */ - brt_prefetch(brt, bp); + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } } void @@ -1514,26 +1457,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt; + brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; - kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); - brt = spa->spa_brt; + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - - mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; - mutex_exit(pending_lock); - for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it @@ -1551,31 +1491,20 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } kmem_cache_free(brt_pending_entry_cache, bpe); - mutex_enter(pending_lock); } - - mutex_exit(pending_lock); } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). 
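The comparator reshuffle earlier in this hunk is a deliberate design choice: keying by (vdev, offset) first keeps the pending AVL tree in on-disk order, so the ZAP updates issued at sync time walk each vdev's BRT sequentially, and birth TXG is demoted to an unlikely tiebreaker. A hedged restatement of the resulting contract; example_cmp is hypothetical, TREE_CMP and the accessors are the ones used above:

/* Sketch only: sort by vdev, then offset, then birth on a tie. */
static int
example_cmp(const blkptr_t *bp1, const blkptr_t *bp2)
{
	int cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
	    DVA_GET_VDEV(&bp2->blk_dva[0]));
	if (cmp == 0)
		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
	if (cmp == 0)
		cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
	return (cmp);
}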
- */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1584,6 +1513,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1607,14 +1537,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) @@ -1729,9 +1664,10 @@ brt_unload(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, - "Enable prefetching of BRT entries"); -#ifdef ZFS_BRT_DEBUG -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); -#endif +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); /* END CSTYLED */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 324bf8cbc276..bb913f556374 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -161,13 +161,13 @@ struct { } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ - wmsum_add(&dbuf_sums.stat, val); + wmsum_add(&dbuf_sums.stat, val) #define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); + DBUF_STAT_INCR(stat, -(val)) #define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); + DBUF_STAT_INCR(stat, 1) #define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); + DBUF_STAT_INCR(stat, -1) #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ @@ -177,7 +177,6 @@ struct { static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. @@ -769,7 +768,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -1217,7 +1216,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); } } } @@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * a decrypted block. Otherwise success. 
*/ static int -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { - int bonuslen, max_bonuslen, err; - - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err) - return (err); + int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -1457,7 +1452,7 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) dn->dn_datablksz : BP_GET_LSIZE(dbbp)); BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); - BP_SET_BIRTH(bp, dbbp->blk_birth, 0); + BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0); } } @@ -1486,7 +1481,7 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) memset(db->db.db_data, 0, db->db.db_size); if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && - bp->blk_birth != 0) { + BP_GET_LOGICAL_BIRTH(bp) != 0) { dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; @@ -1509,32 +1504,46 @@ * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { - int err = 0; objset_t *os = db->db_objset; - arc_buf_t *dnode_abuf; - dnode_t *dn; + dmu_buf_impl_t *dndb; + arc_buf_t *dnbuf; zbookmark_phys_t zb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); + int err; if ((flags & DB_RF_NO_DECRYPT) != 0 || - !os->os_encrypted || os->os_raw_receive) + !os->os_encrypted || os->os_raw_receive || + (dndb = dn->dn_dbuf) == NULL) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; - - if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { - DB_DNODE_EXIT(db); + dnbuf = dndb->db_buf; + if (!arc_is_encrypted(dnbuf)) return (0); - } + + mutex_enter(&dndb->db_mtx); + + /* + * Since the dnode buffer is modified by the sync process, there can + * be only one copy of it, which means we cannot modify (decrypt) it + * while it is being written. It is unclear how this could happen + * today, since encrypted dnode writes by receive should complete + * before any plain-text reads due to the txg wait, but better safe + * than sorry. + */ + while (1) { + if (!arc_is_encrypted(dnbuf)) { + mutex_exit(&dndb->db_mtx); + return (0); + } + dbuf_dirty_record_t *dr = dndb->db_data_pending; + if (dr == NULL || dr->dt.dl.dr_data != dnbuf) + break; + cv_wait(&dndb->db_changed, &dndb->db_mtx); + } SET_BOOKMARK(&zb, dmu_objset_id(os), - DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); - err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + DMU_META_DNODE_OBJECT, 0, dndb->db_blkid); + err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not @@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; - DB_DNODE_EXIT(db); + mutex_exit(&dndb->db_mtx); return (err); } @@ -1557,17 +1566,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning.
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -1576,33 +1582,32 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - err = dbuf_read_bonus(db, dn, flags); + err = dbuf_read_bonus(db, dn); goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; } @@ -1633,17 +1638,12 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * If this is not true it indicates tampering and we report an error. */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { - spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); + spa_log_error(db->db_objset->os_spa, &zb, + BP_GET_LOGICAL_BIRTH(bpp)); err = SET_ERROR(EIO); goto early_unlock; } - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) - goto early_unlock; - - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1668,12 +1668,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * parent's rwlock, which would be a lock ordering violation. 
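This lock-ordering rule is why dbuf_read_impl now returns the arc_read() result directly instead of issuing the read earlier: both db_mtx and the parent's rwlock must be released before entering the ARC, whose done callback can re-enter dbuf code that takes the same locks. A hedged sketch of the required ordering; example_issue_read is hypothetical, and the calls mirror the hunk below:

/* Sketch only: release both locks, then hand the buffer to the ARC. */
static int
example_issue_read(dmu_buf_impl_t *db, blkptr_t *bp, zio_t *pio,
    const zbookmark_phys_t *zb, uint32_t *aflags, db_lock_type_t dblt)
{
	/* Entered with db_mtx and the parent lock held. */
	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);
	dmu_buf_unlock_parent(db, dblt, FTAG);	/* both locks, then ARC */

	return (arc_read(pio, db->db_objset->os_spa, bp, dbuf_read_done, db,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, aflags, zb));
}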
*/ dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1758,36 +1757,65 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { - int err = 0; - boolean_t prefetch; dnode_t *dn; + boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; + int err; - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* + * Ensure that this block's dnode has been decrypted if the caller + * has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, flags); + if (err != 0) + goto done; + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) db->db_partial_read = B_TRUE; else if (!(flags & DB_RF_PARTIAL_MORE)) db->db_partial_read = B_FALSE; - if (db->db_state == DB_CACHED) { + miss = (db->db_state != DB_CACHED); + + if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* - * Ensure that this block's dnode has been decrypted if - * the caller has requested decrypted data. + * Another reader came in while the dbuf was in flight between + * UNCACHED and CACHED. Either a writer will finish filling + * the buffer, sending the dbuf to CACHED, or the first reader's + * request will reach the read_done callback and send the dbuf + * to CACHED. Otherwise, a failure occurred and the dbuf will + * be sent to UNCACHED. */ - err = dbuf_read_verify_dnode_crypt(db, flags); + if (flags & DB_RF_NEVERWAIT) { + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + do { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, + zio_t *, pio); + cv_wait(&db->db_changed, &db->db_mtx); + } while (db->db_state == DB_READ || db->db_state == DB_FILL); + if (db->db_state == DB_UNCACHED) { + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + } + if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it @@ -1795,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. 
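The new in-flight branch above replaces the old tail-end wait, and its essential pattern is a condition-variable loop keyed on db_state: a second reader parks on db_changed until the dbuf leaves DB_READ/DB_FILL, then checks whether the first read actually succeeded. A hedged distillation; example_wait_cached is hypothetical, and every symbol it touches appears in this hunk:

/* Sketch only: wait out an in-flight read or fill on this dbuf. */
static int
example_wait_cached(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	/* UNCACHED here means the first reader's I/O failed. */
	return (db->db_state == DB_UNCACHED ? SET_ERROR(EIO) : 0);
}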
*/ - if (err == 0 && db->db_buf != NULL && - (flags & DB_RF_NO_DECRYPT) == 0 && + if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { @@ -1810,80 +1837,49 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_FALSE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { - boolean_t need_wait = B_FALSE; - + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - - if (zio == NULL && (db->db_state == DB_NOFILL || + if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); - /* - * dbuf_read_impl has dropped db_mtx and our parent's rwlock - * for us - */ - if (!err && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - db->db_state != DB_CACHED, - flags & DB_RF_HAVESTRUCT); - } + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); + /* dbuf_read_impl drops db_mtx and parent's rwlock. */ + miss = (db->db_state != DB_CACHED); + } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); - /* - * If we created a zio_root we must execute it to avoid - * leaking it, even if it isn't attached to any work due - * to an error in dbuf_read_impl(). - */ - if (need_wait) { - if (err == 0) - err = zio_wait(zio); - else - VERIFY0(zio_wait(zio)); - } - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_TRUE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + /* + * If we created a zio we must execute it to avoid leaking it, even if + * it isn't attached to any work due to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(pio); + else + (void) zio_wait(pio); + pio = NULL; + } - /* Skip the wait per the caller's request. 
*/ - if ((flags & DB_RF_NEVERWAIT) == 0) { - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - } +done: + if (miss) + DBUF_STAT_BUMP(hash_misses); + else + DBUF_STAT_BUMP(hash_hits); + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); } return (err); @@ -2630,26 +2626,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - if (dr->dt.dl.dr_brtwrite) { + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned - * block, we cannot simply redirty it, because - * this dr has no data associated with it. + * level 0 block, we cannot simply redirty it, + * because this dr has no associated data. * We will go through a full undirtying below, * before dirtying it again. 
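+ * (A cloned block's dirty record carries only the override block
+ * pointer filled in by dmu_brt_clone(); it never had an ARC buffer
+ * attached, so there is no data to modify in place.)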
*/ @@ -2832,7 +2826,7 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } boolean_t @@ -2909,7 +2903,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } void @@ -4174,21 +4168,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -4611,11 +4590,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4727,7 +4705,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (bp->blk_birth != 0) { + if (BP_GET_LOGICAL_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -4904,11 +4882,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != NULL && - dr->dt.dl.dr_data != db->db_buf) { - arc_buf_destroy(dr->dt.dl.dr_data, db); - } + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -5014,7 +4990,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; + drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -5029,7 +5005,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg) { + BP_GET_LOGICAL_BIRTH(bp) > + ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -5110,21 +5087,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in 
dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -5151,7 +5125,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? @@ -5160,7 +5134,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5192,7 +5166,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index de8640e58a2c..4c53cb0a2f9b 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -437,7 +437,7 @@ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) for (int d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); + ddp->ddp_phys_birth = BP_GET_BIRTH(bp); } void @@ -485,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) return (ddp); } return (NULL); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 77bb3c4d3854..8b440aafba43 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -570,8 +570,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { - if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + if (zs) { + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, + B_TRUE); + } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -607,7 +609,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zfs_racct_write(length, nblks); if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { @@ -713,8 +715,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - int64_t level2 = level; - uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); @@ -724,6 +724,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if 
(dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. @@ -763,8 +775,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); } /* @@ -1620,7 +1630,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1651,7 +1661,7 @@ dmu_sync_late_arrival_done(zio_t *zio) blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } @@ -2258,11 +2268,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (bp == NULL) { /* - * The block was created in this transaction group, - * so it has no BP yet. + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. */ - error = SET_ERROR(EAGAIN); - goto out; + BP_ZERO(&bps[i]); + continue; } /* * Make sure we clone only data blocks. @@ -2278,11 +2290,11 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * operation into ZIL, or it may be impossible to replay, since * the block may appear not yet allocated at that point. 
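+ * The first check below therefore fails with EINVAL, since under a
+ * txg freeze the block can never sync; the second returns EAGAIN so
+ * the caller can retry once the txg has synced.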
*/ - if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { error = SET_ERROR(EINVAL); goto out; } - if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { error = SET_ERROR(EAGAIN); goto out; } @@ -2354,18 +2366,17 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; - dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; - if (BP_IS_HOLE(bp)) { - dl->dr_overridden_by.blk_birth = 0; - dl->dr_overridden_by.blk_phys_birth = 0; - } else { - dl->dr_overridden_by.blk_birth = dr->dr_txg; + if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_GET_BIRTH(bp)); + } else { + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, + dr->dr_txg); } } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); @@ -2564,6 +2575,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f098e1daa44b..f1818ae155bd 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. + * The lower 11 bits of the pointer don't have much entropy, because + * the objset_t is more than 1KB long and so likely aligned to 2KB. 
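+ * Shifting by 11 instead feeds the CRC the low-order pointer bits
+ * that actually differ between objset_t allocations rather than the
+ * zeros introduced by that alignment.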
 */
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
 sync_objset_arg_t *soa = sda->sda_soa;
 objset_t *os = soa->soa_os;
+ uint_t allocator = spa_acq_allocator(os->os_spa);
 multilist_sublist_t *ms =
- multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+ multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 dmu_objset_sync_dnodes(ms, soa->soa_tx);
 multilist_sublist_unlock(ms);
+ spa_rel_allocator(os->os_spa, allocator);
 kmem_free(sda, sizeof (*sda));
@@ -2076,8 +2078,8 @@ userquota_updates_task(void *arg)
 dnode_t *dn;
 userquota_cache_t cache = { { 0 } };
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
 ASSERT(multilist_sublist_head(list) == NULL ||
 dmu_objset_userused_enabled(os));
@@ -2159,8 +2161,8 @@ dnode_rele_task(void *arg)
 userquota_updates_arg_t *uua = arg;
 objset_t *os = uua->uua_os;
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
 dnode_t *dn;
 while ((dn = multilist_sublist_head(list)) != NULL) {
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 54aa60259ea1..680aed4513bc 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1352,8 +1352,10 @@ corrective_read_done(zio_t *zio)
 {
 cr_cb_data_t *data = zio->io_private;
 /* Corruption corrected; update error log if needed */
- if (zio->io_error == 0)
- spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth);
+ if (zio->io_error == 0) {
+ spa_remove_error(data->spa, &data->zb,
+ BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ }
 kmem_free(data, sizeof (cr_cb_data_t));
 abd_free(zio->io_abd);
 }
@@ -1480,8 +1482,9 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 }
 rrd->abd = abd;
- io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd,
- BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb);
+ io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+ abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
+ &zb);
 ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
 abd_get_size(abd) == BP_GET_PSIZE(bp));
@@ -2110,6 +2113,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
 dmu_buf_rele(db, FTAG);
 dnode_rele(dn, FTAG);
 }
+
+ /*
+ * If the receive fails, we want the resume stream to start with the
+ * same record that we last successfully received. There is no way to
+ * request a resume from the object record, but we can benefit from
+ * the fact that the sender always sends the object record before
+ * anything else, after which it will "resend" data at offset 0 and
+ * resume normally.
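+ * Saving (object, offset 0) below is therefore always safe: at worst
+ * the sender retransmits a little data we have already received.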
+ */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); @@ -2343,7 +2356,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2375,19 +2387,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 37c68528bf95..b6cc2f0a5e91 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -619,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && - (bp->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } @@ -804,7 +804,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, */ if (zfs_send_unmodified_spill_blocks && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { struct send_range record; blkptr_t *bp = DN_SPILL_BLKPTR(dnp); @@ -1123,7 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 809f7f6165f9..15cc2885e805 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -108,7 +109,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -192,7 +193,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, */ if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); @@ -235,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ASSERT(0); } - if (bp->blk_birth == 0) { + if (BP_GET_LOGICAL_BIRTH(bp) == 0) { /* * Since this block has a birth time of 0 it must be one of * two things: a hole created before the @@ -263,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t 
*dnp, zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); - } else if (bp->blk_birth <= td->td_min_txg) { + } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { return (0); } diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 2b2d72671001..ed50f1889b59 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; +/* max request reorder distance within a stream (default 16MB) */ +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +/* Max log2 fraction of holes in a stream */ +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_future; + kstat_named_t zfetchstat_stride; + kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; @@ -76,6 +83,9 @@ typedef struct zfetch_stats { static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "future", KSTAT_DATA_UINT64 }, + { "stride", KSTAT_DATA_UINT64 }, + { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, @@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = { struct { wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_future; + wmsum_t zfetchstat_stride; + wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; @@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw) return (EACCES); zs->zfetchstat_hits.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_future.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_future); + zs->zfetchstat_stride.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_stride); + zs->zfetchstat_past.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = @@ -122,6 +141,9 @@ void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_future, 0); + wmsum_init(&zfetch_sums.zfetchstat_stride, 0); + wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); @@ -147,6 +169,9 @@ zfetch_fini(void) } wmsum_fini(&zfetch_sums.zfetchstat_hits); + wmsum_fini(&zfetch_sums.zfetchstat_future); + wmsum_fini(&zfetch_sums.zfetchstat_stride); + wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); @@ -222,22 +247,22 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs, *zs_next, *zs_old = NULL; - hrtime_t now = gethrtime(), t; + uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Delete too old streams, reusing the first found one. */ - t = now - SEC2NSEC(zfetch_max_sec_reap); + t = now - zfetch_max_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. 
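+ * Note: the age test below uses a signed difference,
+ * (int)(zs->zs_atime - t), so the 32-bit second counts from
+ * gethrestime_sec() still compare correctly if they ever wrap.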
 */
- if (zfs_refcount_count(&zs->zs_refs) != 1)
+ if ((int)(zs->zs_atime - t) >= 0)
 continue;
- if (zs->zs_atime > t)
+ if (zfs_refcount_count(&zs->zs_refs) != 1)
 continue;
 if (zs_old)
 dmu_zfetch_stream_remove(zf, zs);
@@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 }
 if (zs_old) {
 zs = zs_old;
+ list_remove(&zf->zf_stream, zs);
 goto reuse;
 }
@@ -255,21 +281,23 @@
 * for all the streams to be non-overlapping.
 */
 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
- zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
 zfetch_max_distance));
 if (zf->zf_numstreams >= max_streams) {
- t = now - SEC2NSEC(zfetch_min_sec_reap);
+ t = now - zfetch_min_sec_reap;
 for (zs = list_head(&zf->zf_stream); zs != NULL;
 zs = list_next(&zf->zf_stream, zs)) {
- if (zfs_refcount_count(&zs->zs_refs) != 1)
+ if ((int)(zs->zs_atime - t) >= 0)
 continue;
- if (zs->zs_atime > t)
+ if (zfs_refcount_count(&zs->zs_refs) != 1)
 continue;
- if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+ if (zs_old == NULL ||
+ (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
 zs_old = zs;
 }
 if (zs_old) {
 zs = zs_old;
+ list_remove(&zf->zf_stream, zs);
 goto reuse;
 }
 ZFETCHSTAT_BUMP(zfetchstat_max_streams);
@@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 }
 zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
- zs->zs_fetch = zf;
 zfs_refcount_create(&zs->zs_callers);
 zfs_refcount_create(&zs->zs_refs);
 /* One reference for zf_stream. */
 zfs_refcount_add(&zs->zs_refs, NULL);
 zf->zf_numstreams++;
- list_insert_head(&zf->zf_stream, zs);
 reuse:
+ list_insert_head(&zf->zf_stream, zs);
 zs->zs_blkid = blkid;
+ /* Allow immediate stream reuse until first hit. */
+ zs->zs_atime = now - zfetch_min_sec_reap;
+ memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
 zs->zs_pf_dist = 0;
+ zs->zs_ipf_dist = 0;
 zs->zs_pf_start = blkid;
 zs->zs_pf_end = blkid;
- zs->zs_ipf_dist = 0;
 zs->zs_ipf_start = blkid;
 zs->zs_ipf_end = blkid;
- /* Allow immediate stream reuse until first hit. */
- zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
 zs->zs_missed = B_FALSE;
 zs->zs_more = B_FALSE;
 }
@@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
 aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }
+/*
+ * Process a stream hit access for nblks blocks starting at zs_blkid. Return
+ * the number of blocks to proceed with after aggregation with any stored
+ * future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+ uint_t i, j;
+
+ /* Optimize sequential accesses (no future ranges). */
+ if (zs->zs_ranges[0].start == 0)
+ goto done;
+
+ /* Look for intersections with further ranges. */
+ for (i = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0 || r->start > nblks)
+ break;
+ if (r->end >= nblks) {
+ nblks = r->end;
+ i++;
+ break;
+ }
+ }
+
+ /* Delete all intersecting ranges found above and update the rest. */
+ for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+ if (zs->zs_ranges[i].start == 0)
+ break;
+ ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+ ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+ zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+ zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+ }
+ if (j < ZFETCH_RANGES) {
+ zs->zs_ranges[j].start = 0;
+ zs->zs_ranges[j].end = 0;
+ }
+
+done:
+ zs->zs_blkid += nblks;
+ return (nblks);
+}
+
+/*
+ * Process a future stream access for nblks blocks starting at blkid.
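+ * Out-of-order accesses accumulate in the zs_ranges[] array and are
+ * merged with neighboring ranges as they arrive.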
+ * Return the number of blocks to proceed with once the accumulated future
+ * ranges reach the fill threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+ ASSERT3U(blkid, >, zs->zs_blkid);
+ blkid -= zs->zs_blkid;
+ ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+ /* Search for the first and last intersection, or the insert point. */
+ uint_t f = ZFETCH_RANGES, l = 0, i;
+ for (i = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0 || r->start > blkid + nblks)
+ break;
+ if (r->end < blkid)
+ continue;
+ if (f > i)
+ f = i;
+ if (l < i)
+ l = i;
+ }
+ if (f <= l) {
+ /* Got some intersecting range, expand it if needed. */
+ if (zs->zs_ranges[f].start > blkid)
+ zs->zs_ranges[f].start = blkid;
+ zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+ if (f < l) {
+ /* Got more than one intersection, remove the others. */
+ for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+ zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+ zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+ }
+ zs->zs_ranges[f].start = 0;
+ zs->zs_ranges[f].end = 0;
+ }
+ } else if (i < ZFETCH_RANGES) {
+ /* Got no intersecting ranges, insert a new one. */
+ for (l = ZFETCH_RANGES - 1; l > i; l--) {
+ zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+ zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+ }
+ zs->zs_ranges[i].start = blkid;
+ zs->zs_ranges[i].end = blkid + nblks;
+ } else {
+ /* No space left to insert. Drop the range. */
+ return (0);
+ }
+
+ /* Check whether, with this access added, we reached the fill threshold. */
+ if (zfetch_hole_shift >= 16)
+ return (0);
+ uint_t hole = 0;
+ for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0)
+ break;
+ hole += r->start - f;
+ f = r->end;
+ if (hole <= r->end >> zfetch_hole_shift)
+ l = r->end;
+ }
+ if (l > 0)
+ return (dmu_zfetch_hit(zs, l));
+
+ return (0);
+}
+
 /*
 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
 * associates dnode access specified with blkid and nblks arguments with
@@ -370,53 +512,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 mutex_enter(&zf->zf_lock);
 /*
- * Find matching prefetch stream. Depending on whether the accesses
+ * Find a perfect prefetch stream. Depending on whether the accesses
 * are block-aligned, first block of the new access may either follow
 * the last block of the previous access, or be equal to it.
 */
+ unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+ uint64_t end_blkid = blkid + nblks;
 for (zs = list_head(&zf->zf_stream); zs != NULL;
 zs = list_next(&zf->zf_stream, zs)) {
 if (blkid == zs->zs_blkid) {
- break;
+ goto hit;
 } else if (blkid + 1 == zs->zs_blkid) {
 blkid++;
 nblks--;
- break;
+ goto hit;
 }
 }
 /*
- * If the file is ending, remove the matching stream if found.
- * If not found then it is too late to create a new one now.
+ * Find a close enough prefetch stream. An access crossing the stream
+ * position is a hit in its new part. An access ahead of the stream
+ * position is counted as a hit for metadata prefetch, since we do not
+ * care about the fill percent there; otherwise it is stored as a
+ * future range. An access behind the stream position is silently
+ * ignored, since we already skipped it when reaching the fill percent.
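+ * For example, if requests arrive out of order as blocks [0, 4),
+ * [8, 12) and then [4, 8): the first access creates the stream, the
+ * second is stored as a future range, and the third is a hit that
+ * merges with the stored range and advances the stream by 8 blocks.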
*/ - uint64_t end_of_access_blkid = blkid + nblks; - if (end_of_access_blkid >= maxblkid) { - if (zs != NULL) - dmu_zfetch_stream_remove(zf, zs); - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); + uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); + uint_t t = gethrestime_sec() - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid > zs->zs_blkid) { + if (end_blkid <= zs->zs_blkid + max_reorder) { + if (!fetch_data) { + nblks = dmu_zfetch_hit(zs, + end_blkid - zs->zs_blkid); + ZFETCHSTAT_BUMP(zfetchstat_stride); + goto future; + } + nblks = dmu_zfetch_future(zs, blkid, nblks); + if (nblks > 0) + ZFETCHSTAT_BUMP(zfetchstat_stride); + else + ZFETCHSTAT_BUMP(zfetchstat_future); + goto future; + } + } else if (end_blkid >= zs->zs_blkid) { + nblks -= zs->zs_blkid - blkid; + blkid += zs->zs_blkid - blkid; + goto hit; + } else if (end_blkid + max_reorder > zs->zs_blkid && + (int)(zs->zs_atime - t) >= 0) { + ZFETCHSTAT_BUMP(zfetchstat_past); + zs->zs_atime = gethrestime_sec(); + goto out; + } } - /* Exit if we already prefetched this block before. */ - if (nblks == 0) { - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); - } + /* + * This access is not part of any existing stream. Create a new + * stream for it unless we are at the end of file. + */ + if (end_blkid < maxblkid) + dmu_zfetch_stream_create(zf, end_blkid); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. - */ - dmu_zfetch_stream_create(zf, end_of_access_blkid); +hit: + nblks = dmu_zfetch_hit(zs, nblks); + ZFETCHSTAT_BUMP(zfetchstat_hits); + +future: + zs->zs_atime = gethrestime_sec(); + + /* Exit if we already prefetched for this position before. */ + if (nblks == 0) + goto out; + + /* If the file is ending, remove the stream. */ + end_blkid = zs->zs_blkid; + if (end_blkid >= maxblkid) { + dmu_zfetch_stream_remove(zf, zs); +out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } @@ -432,7 +613,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * than ~6% of ARC held by active prefetches. It should help with * getting out of RAM on some badly mispredicted read patterns. */ - unsigned int dbs = zf->zf_dnode->dn_datablkshift; unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { @@ -452,10 +632,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, } else { pf_nblks = 0; } - if (zs->zs_pf_start < end_of_access_blkid) - zs->zs_pf_start = end_of_access_blkid; - if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) - zs->zs_pf_end = end_of_access_blkid + pf_nblks; + if (zs->zs_pf_start < end_blkid) + zs->zs_pf_start = end_blkid; + if (zs->zs_pf_end < end_blkid + pf_nblks) + zs->zs_pf_end = end_blkid + pf_nblks; /* * Do the same for indirects, starting where we will stop reading @@ -473,9 +653,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; - zs->zs_blkid = end_of_access_blkid; - /* Protect the stream from reclamation. 
*/ - zs->zs_atime = gethrtime(); zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. */ zfs_refcount_add(&zs->zs_callers, NULL); @@ -483,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - - ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void -dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, + boolean_t have_lock) { - zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; @@ -567,7 +742,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) - dmu_zfetch_run(zs, missed, have_lock); + dmu_zfetch_run(zf, zs, missed, have_lock); } ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, @@ -590,3 +765,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, + "Max request reorder distance within a stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, + "Max log2 fraction of holes in a stream"); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ba28aa06a91f..a703fd414f87 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2557,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || + BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree @@ -2605,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) + (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 4faefecbadbb..5fd8bc2a2682 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -1520,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. */ - if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= + dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 62a1649d3786..b4de0e7ff073 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -156,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -190,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * they do not need to be freed. 
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -236,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); - fakebp.blk_birth = birth; + BP_SET_LOGICAL_BIRTH(&fakebp, birth); DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); @@ -259,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); @@ -277,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -285,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, bplist_append(&ds->ds_dir->dd_pending_frees, bp); } - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); @@ -317,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + /* if (logical birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > + ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -2895,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; + birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds)); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index e6c8d4be13b4..eff1f7de7731 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -474,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); - dle_tofind.dle_mintxg = bp->blk_birth; + dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); @@ -483,7 +483,7 @@ 
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", - bp, (longlong_t)bp->blk_birth); + bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); dle = avl_first(&dl->dl_tree); } @@ -1039,8 +1039,7 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); ASSERT3U(BP_GET_CHECKSUM(bp), ==, BP_GET_CHECKSUM(&found->le_bp)); - ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, - BP_PHYSICAL_BIRTH(&found->le_bp)); + ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); } if (bp_freed) { if (found == NULL) { diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index d9d88a981e05..d4a6e5b6e9fd 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -132,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= + dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > + BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); @@ -313,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); @@ -727,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -1017,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 370c6a010dca..342ec5c15c79 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1047,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * will be wrong. 
*/ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 060a5cc36d70..085cfd3c5691 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -429,8 +429,8 @@ sio2bp(const scan_io_t *sio, blkptr_t *bp) { memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; + BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); + BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; @@ -444,8 +444,8 @@ static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; + sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); + sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); @@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -1721,7 +1723,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1730,7 +1733,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). 
*/ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1756,7 +1760,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1764,7 +1768,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, * already txg sync'ed (but this log block contains * other records that are not synced) */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -1903,7 +1907,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; @@ -2174,7 +2179,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2270,7 +2275,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, * by arc_read() for the cases above. */ scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2347,7 +2352,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } @@ -2373,7 +2378,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. 
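+ * (BP_GET_BIRTH() returns the physical birth time when one is set
+ * and the logical birth time otherwise, so a rewritten block is
+ * judged by the txg in which it was last physically written.)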
*/ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; return; } @@ -2720,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2912,8 +2919,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -4714,7 +4723,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3a9250fe745f..cb004930d287 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); @@ -5495,8 +5495,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); @@ -5845,8 +5846,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; int error = 0; - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); + ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -5900,7 +5901,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that @@ -5918,7 +5919,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 66bc0ae60b10..71122542758d 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -664,12 +664,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index b1cdf1c5c5f4..3d3ef86e6839 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. 
* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b144d0652930..560fd67087b6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -170,17 +170,22 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. + * and interrupt) and then to reserve threads for high priority I/Os that + * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT + * implementation, so separate high priority threads are used there. */ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ +#ifdef illumos { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#else + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ +#endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; @@ -208,7 +213,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ #endif -static uint_t zio_taskq_wr_iss_ncpus = 0; +static uint_t zio_taskq_write_tpq = 16; /* * Report any spa_load_verify errors found, but do not fail spa_load. @@ -1067,17 +1072,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) case ZTI_MODE_SYNC: /* - * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', - * not to exceed the number of spa allocators. + * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, + * not to exceed the number of spa allocators, and align to it. */ - if (zio_taskq_wr_iss_ncpus == 0) { - count = MAX(boot_ncpus / spa->spa_alloc_count, 1); - } else { - count = MAX(1, - boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); - } + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); count = MAX(count, (zio_taskq_batch_pct + 99) / 100); count = MIN(count, spa->spa_alloc_count); + while (spa->spa_alloc_count % count != 0 && + spa->spa_alloc_count < count * 2) + count--; /* * zio_taskq_batch_pct is unbounded and may exceed 100%, but no @@ -1218,7 +1222,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) * * Example (the defaults for READ and WRITE) * zio_taskq_read='fixed,1,8 null scale null' - * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' + * zio_taskq_write='sync null scale null' * * Each sets the entire row at a time. 
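+ * For example, zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
+ * restores the separate high priority write taskqs used previously.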
* @@ -1495,15 +1499,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); - if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && - (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { - /* dispatch to assigned write issue taskq */ - tq = zio->io_wr_iss_tq; - return (tq); - } - if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; + } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) { + tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } @@ -2655,8 +2655,8 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; + if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } @@ -3273,8 +3273,6 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); @@ -3596,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. + * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; @@ -3645,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " - "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } - int interations = 0; + int iterations = 0; while ((now = gethrtime()) < import_expire) { - if (interations++ % 30 == 0) { + if (importing && iterations++ % 30 == 0) { spa_import_progress_set_notes(spa, "Checking MMP " "activity, %llu ms remaining", (u_longlong_t)NSEC2MSEC(import_expire - now)); } - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3739,6 +3746,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. 
+ */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -4065,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -4981,7 +5044,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -5228,6 +5292,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -5272,13 +5337,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) return (error); } + /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. 
Those vdevs were removed @@ -5291,7 +5362,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if @@ -5300,7 +5371,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool @@ -5309,7 +5380,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. @@ -5317,7 +5388,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - @@ -5326,7 +5397,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices @@ -5335,17 +5406,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) - return (error); + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors @@ -5354,7 +5425,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -5364,8 +5435,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -5376,7 +5448,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. 
This must be done before @@ -5501,13 +5573,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) spa_config_exit(spa, SCL_CONFIG, FTAG); spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -6266,7 +6344,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *nvl; if (props == NULL || - nvlist_lookup_string(props, "tname", &poolname) != 0) + nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* @@ -6756,9 +6835,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6873,6 +6957,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, { int error; spa_t *spa; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -7017,6 +7102,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_FALSE; } + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + mutex_exit(&spa_namespace_lock); return (0); @@ -7082,7 +7170,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -7173,6 +7261,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); @@ -8738,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8834,11 +8933,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. 
*/ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -9801,7 +9900,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * don't want to rely on that here). */ if (pass == 1 && - spa->spa_uberblock.ub_rootbp.blk_birth < txg && + BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this @@ -10134,16 +10233,10 @@ spa_sync_tq_create(spa_t *spa, const char *name) VERIFY(spa->spa_sync_tq != NULL); VERIFY(kthreads != NULL); - spa_taskqs_t *tqs = - &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; - spa_syncthread_info_t *ti = spa->spa_syncthreads; - for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + for (int i = 0; i < nthreads; i++, ti++) { ti->sti_thread = kthreads[i]; - if (w == tqs->stqs_count) { - w = 0; - } - ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + ti->sti_allocator = i; } kmem_free(kthreads, sizeof (*kthreads) * nthreads); @@ -10162,6 +10255,42 @@ spa_sync_tq_destroy(spa_t *spa) spa->spa_sync_tq = NULL; } +uint_t +spa_acq_allocator(spa_t *spa) +{ + int i; + + if (spa->spa_alloc_count == 1) + return (0); + + mutex_enter(&spa->spa_allocs_use->sau_lock); + uint_t r = spa->spa_allocs_use->sau_rotor; + do { + if (++r == spa->spa_alloc_count) + r = 0; + } while (spa->spa_allocs_use->sau_inuse[r]); + spa->spa_allocs_use->sau_inuse[r] = B_TRUE; + spa->spa_allocs_use->sau_rotor = r; + mutex_exit(&spa->spa_allocs_use->sau_lock); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + ti->sti_allocator = r; + break; + } + } + ASSERT3S(i, <, spa->spa_alloc_count); + return (r); +} + +void +spa_rel_allocator(spa_t *spa, uint_t allocator) +{ + if (spa->spa_alloc_count > 1) + spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; +} + void spa_select_allocator(zio_t *zio) { @@ -10189,8 +10318,7 @@ spa_select_allocator(zio_t *zio) spa_syncthread_info_t *ti = spa->spa_syncthreads; for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { if (ti->sti_thread == curthread) { - zio->io_allocator = i; - zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + zio->io_allocator = ti->sti_allocator; return; } } @@ -10207,7 +10335,6 @@ spa_select_allocator(zio_t *zio) bm->zb_blkid >> 20); zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; - zio->io_wr_iss_tq = NULL; } /* @@ -10778,10 +10905,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ @@ -10812,13 +10939,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, - spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, "Configure IO queues for read IO"); ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 
- spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif /* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, - "Number of CPUs to run write issue taskqs"); +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, + "Number of CPUs per write issue taskq"); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 244b4d264212..62d7b4fa2df2 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -180,7 +180,7 @@ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -223,13 +223,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) new->se_zep.zb_object = zb->zb_object; new->se_zep.zb_level = zb->zb_level; new->se_zep.zb_blkid = zb->zb_blkid; - - /* - * birth may end up being NULL, e.g. in zio_done(). We - * will handle this in process_error_block(). - */ - if (birth != NULL) - new->se_zep.zb_birth = *birth; + new->se_zep.zb_birth = birth; } avl_insert(tree, new, where); @@ -258,7 +252,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); - *birth_txg = bp.blk_birth; + *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); @@ -535,7 +529,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); - spa_remove_error(spa, &zb, &zep->zb_birth); + spa_remove_error(spa, &zb, zep->zb_birth); } return (error); @@ -563,7 +557,7 @@ spa_get_last_errlog_size(spa_t *spa) */ static void spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, - const uint64_t *birth) + const uint64_t birth) { char name[NAME_MAX_LEN]; @@ -618,11 +612,7 @@ spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, healed_zep.zb_object = healed_zb->zb_object; healed_zep.zb_level = healed_zb->zb_level; healed_zep.zb_blkid = healed_zb->zb_blkid; - - if (birth != NULL) - healed_zep.zb_birth = *birth; - else - healed_zep.zb_birth = 0; + healed_zep.zb_birth = birth; errphys_to_name(&healed_zep, name, sizeof (name)); @@ -742,7 +732,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) * later in spa_remove_healed_errors(). */ void -spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth) { spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); @@ -890,7 +880,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error == EACCES) error = 0; else if (!error) - zep.zb_birth = bp.blk_birth; + zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 873089a53e34..32158e8c592c 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -783,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * request of flushing everything before we attempt to return * immediately. 
*/ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && !spa_flush_all_logs_requested(spa)) return; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 68b907614196..e6d4a9bdb29c 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -82,7 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy/export + * - Held at the start and end of import * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -235,9 +236,9 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; +kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; @@ -393,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; * Number of allocators to use, per spa instance */ static int spa_num_allocators = 4; +static int spa_cpus_per_allocator = 4; /* * Spa active allocator. @@ -619,6 +621,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -630,6 +633,14 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + if (spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -728,6 +739,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -736,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - /* Do not allow more allocators than CPUs. */ - spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus); + /* Do not allow more allocators than fraction of CPUs. 
*/ + spa->spa_alloc_count = MAX(MIN(spa_num_allocators, + boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); @@ -747,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + if (spa->spa_alloc_count > 1) { + spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count]), KM_SLEEP); + mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, + NULL); + } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); @@ -826,7 +845,6 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -843,6 +861,11 @@ spa_remove(spa_t *spa) } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); + if (spa->spa_alloc_count > 1) { + mutex_destroy(&spa->spa_allocs_use->sau_lock); + kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count])); + } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); @@ -920,7 +943,8 @@ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } @@ -932,7 +956,8 @@ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } @@ -3085,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, - "Number of allocators per spa, capped by ncpus"); + "Number of allocators per spa"); + +ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, + "Minimum number of CPUs per allocator"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index a67c043446f5..5ce6be69be14 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -550,6 +550,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after an MMP suspend. + */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary.
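The spa_cpus_per_allocator scaling introduced above makes the allocator count follow machine size; a quick worked example using the defaults shown in this patch (spa_num_allocators = 4, spa_cpus_per_allocator = 4; the CPU counts are invented):

/*
 * boot_ncpus = 2:  MAX(MIN(4, 2 / 4), 1)  = MAX(0, 1) = 1 allocator
 * boot_ncpus = 8:  MAX(MIN(4, 8 / 4), 1)  = MIN(4, 2) = 2 allocators
 * boot_ncpus = 32: MAX(MIN(4, 32 / 4), 1) = MIN(4, 8) = 4 allocators
 *
 * Small systems no longer get more allocators than they have CPUs to
 * feed them, while spa_num_allocators still caps large systems.
 */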
diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c index 1921be107660..22ee8036c473 100644 --- a/module/zfs/uberblock.c +++ b/module/zfs/uberblock.c @@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) } ub->ub_checkpoint_txg = 0; - return (ub->ub_rootbp.blk_birth == txg); + return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8e81b87ef380..414bf84f6f7a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1665,6 +1665,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1710,6 +1711,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1760,6 +1772,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1786,15 +1799,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) @@ -4925,11 +4929,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * TRIM ops and bytes are reported to user space as - * ZIO_TYPE_IOCTL. This is done to preserve the + * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) - vs_type = ZIO_TYPE_IOCTL; + vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' @@ -6240,12 +6244,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user - * space as ZIO_TYPE_IOCTL. This is done to + * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, - vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL], + vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: @@ -6276,12 +6280,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user - * space as ZIO_TYPE_IOCTL. This is done to + * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. 
*/ vdev_prop_add_list(outnvl, propname, NULL, - vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL], + vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index ec961255fd64..13bb33cc6871 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -2548,24 +2548,20 @@ vdev_draid_read_config_spare(vdev_t *vd) } /* - * Handle any ioctl requested of the distributed spare. Only flushes - * are supported in which case all children must be flushed. + * Handle any flush requested of the distributed spare. All children must be + * flushed. */ static int -vdev_draid_spare_ioctl(zio_t *zio) +vdev_draid_spare_flush(zio_t *zio) { vdev_t *vd = zio->io_vd; int error = 0; - if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { - for (int c = 0; c < vd->vdev_children; c++) { - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[c], zio->io_offset, zio->io_abd, - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_draid_spare_child_done, zio)); - } - } else { - error = SET_ERROR(ENOTSUP); + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); } return (error); @@ -2596,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - zio->io_error = vdev_draid_spare_ioctl(zio); + case ZIO_TYPE_FLUSH: + zio->io_error = vdev_draid_spare_flush(zio); break; case ZIO_TYPE_WRITE: diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 5aaef1a69986..c5e16af16692 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. */ #include @@ -775,7 +775,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c31f48028bbc..ed592514fded 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -2027,6 +2027,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. 
*/ @@ -2034,7 +2035,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index f9a01c9f53f4..102eacb03349 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -531,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e416becbba18..15c8b8ca6016 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1891,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; @@ -2190,12 +2191,11 @@ vdev_raidz_close(vdev_t *vd) /* * Return the logical width to use, given the txg in which the allocation - * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * happened. Note that BP_GET_BIRTH() is usually the txg in which the * BP was allocated. Remapped BP's (that were relocated due to device - * removal, see remap_blkptr_cb()), will have a more recent - * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can - * ignore these because they can't be on RAIDZ (device removal doesn't - * support RAIDZ). + * removal, see remap_blkptr_cb()), will have a more recent physical birth + * which reflects when the BP was relocated, but we can ignore these because + * they can't be on RAIDZ (device removal doesn't support RAIDZ). */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) @@ -2295,7 +2295,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, rr->rr_size, - BP_PHYSICAL_BIRTH(zio->io_bp)); + BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -2518,7 +2518,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, - BP_PHYSICAL_BIRTH(zio->io_bp)); + BP_GET_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 6503390f7973..00ebd4c9fca4 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -23,6 +23,7 @@ * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024 by Delphix. All rights reserved. 
*/ #include @@ -1071,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 7e3c5f684703..9753d5a1ea04 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP * Copyright 2023 RackTop Systems, Inc. @@ -1148,7 +1148,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1568,8 +1569,8 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..1b6b16fc6662 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko + * Copyright (c) 2023, Klara Inc. */ /* @@ -41,6 +43,7 @@ #include #include +#include #include #include #include @@ -78,9 +81,16 @@ */ static int zap_iterate_prefetch = B_TRUE; +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. 
+ */ +int zap_shrink_enabled = B_TRUE; + int fzap_default_block_shift = 14; /* 16k blocksize */ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); void fzap_byteswap(void *vbuf, size_t size) @@ -133,7 +143,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +203,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +265,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +277,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +306,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. 
- */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +322,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +369,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +434,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +534,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -597,6 +596,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) } } +static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) + +/* + * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range, we only need to check the first and the last one.
+ */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1 << pref_diff); + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { @@ -969,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); } zap_put_leaf(l); return (err); @@ -985,7 +1054,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,18 +1297,24 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; + + /* + * The leaf was either shrunk or split. + */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } } again: @@ -1248,8 +1323,6 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) &zc->zc_leaf); if (err != 0) return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; @@ -1356,7 +1429,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1439,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { @@ -1378,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } +/* + * Find last allocated block and update freeblk. 
+ */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively, removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates the need to rehash the non-empty + * remaining leaf. When we have removed one of two empty siblings, we set the + * ptrtbl entries of the removed leaf to point to the remaining leaf. The prefix + * length of the remaining leaf is decremented. As a result, it has a new prefix + * and it might have a new sibling. So, we repeat the process. + * + * Steps: + * 1. Check if a sibling leaf (sl) exists and is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) to dereference it again with a WRITER lock. + * 4. Upgrade the zapdir lock to WRITER (once). + * 5. Dereference the released leaves again. + * 6. If needed, recheck whether both leaves are still siblings and empty. + * 7. Set the ptrtbl pointers of the removed leaf (slbit 1) to point to the + * blkid of the remaining leaf (slbit 0). + * 8. Free the disk block of the removed leaf (dmu_free_range). + * 9. Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps. + */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock, always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty.
+ */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there are two empty siblings, we have work to do, so + * we need to lock the ZAP ptrtbl as WRITER. + */ + if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Usually, the right way to upgrade from a READER lock + * to a WRITER lock is to call zap_unlockdir() and + * zap_lockdir(), but we do not have a tag. Instead, + * we do it in a more sophisticated way. + */ + rw_exit(&zap->zap_rwlock); + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have a WRITER lock for the ptrtbl. + * Now, we need a WRITER lock for both sibling leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set the ptrtbl entries to point to the sibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block. + */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len.
+ */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index e6afb1c58c95..032aca92695e 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -41,7 +41,8 @@ #include #include -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -96,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -160,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, 
ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set 
new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..d806988af96d 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. 
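Both zap_micro.c hunks above make the same simplification: a held dbuf already knows which objset and object it belongs to, so mzap_open() no longer needs them passed alongside it, and zap_lockdir_impl() takes the caller's dnode instead of re-deriving state. A minimal sketch of the identity-from-dbuf idea, using only accessors that appear in the patch itself (dmu_buf_get_objset() and db_object); this is an illustration, not the patch's code:

	/*
	 * A held dbuf carries its own identity, so callers need not
	 * thread (os, obj) through every layer alongside it.
	 */
	static void
	zap_identity_from_dbuf(zap_t *zap, dmu_buf_t *db)
	{
		zap->zap_objset = dmu_buf_get_objset(db);
		zap->zap_object = db->db_object;
		zap->zap_dbuf = db;
	}

Besides trimming parameter lists, this removes the possibility of a caller passing an (objset, object) pair that disagrees with the dbuf actually held.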
@@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. 
*/ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, 
TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 481af2ba826b..2f43c4aa41b8 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1096,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) return (B_FALSE); if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 
- */ + /* If this is not a read or write zio, ignore the error */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b2b06881bdd4..908b9efc1813 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -40,6 +40,7 @@ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright 2024 Oxide Computer Company */ /* @@ -1886,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); @@ -5345,8 +5346,9 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) + strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5354,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &recvdprops)) != 0) - return (error); + zc->zc_iflags, &recvdprops)) != 0) { + goto out; + } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &localprops)) != 0) - return (error); + zc->zc_iflags, &localprops)) != 0) { + goto out; + } if (zc->zc_string[0]) origin = zc->zc_string; @@ -5372,8 +5376,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); - nvlist_free(recvdprops); - nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop @@ -5389,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = SET_ERROR(EINVAL); } +out: nvlist_free(errors); + nvlist_free(recvdprops); + nvlist_free(localprops); return (error); } @@ -5456,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || - strchr(snapname, '%')) + strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5481,15 +5487,15 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) - return (error); + goto out; error = 
zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, @@ -5499,9 +5505,11 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); +out: nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); + nvlist_free(hidden_args); return (error); } @@ -5815,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 433a653e5500..fa4e7093ca46 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -895,7 +895,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; - size_t i, partnbps; + size_t partnbps; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; @@ -904,10 +904,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); - partlen = 0; - for (i = 0; i < partnbps; i++) { - partlen += BP_GET_LSIZE(&bps[i]); - } + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7f39ad6fc775..babb07ca25a9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -123,7 +123,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp, 0, file_sz - 1)) - zn_flush_cached_data(zp, B_FALSE); + zn_flush_cached_data(zp, B_TRUE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); @@ -1187,6 +1187,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } } + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) + zn_flush_cached_data(inzp, B_TRUE); + /* * Maintain predictable lock order. */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e549e1895f39..34be54b337fd 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -125,10 +125,9 @@ static kstat_t *zil_kstats_global; int zil_replay_disable = 0; /* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. + * Disable the flush commands that are normally sent to the disk(s) by the ZIL + * after an LWB write has completed. Setting this will cause ZIL corruption on + * power loss if a volatile out-of-order write cache is enabled. */ static int zil_nocacheflush = 0; @@ -557,7 +556,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. 
*/ - if (bp->blk_birth >= first_txg) + if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -583,7 +582,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -608,7 +607,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (lr->lr_blkptr.blk_birth >= first_txg) { + if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -655,7 +654,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, * just in case lets be safe and just stop here now instead of * corrupting the pool. */ - if (BP_PHYSICAL_BIRTH(bp) >= first_txg) + if (BP_GET_BIRTH(bp) >= first_txg) return (SET_ERROR(ENOENT)); /* @@ -710,8 +709,8 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) /* * If we previously claimed it, we need to free it. */ - if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) { + if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } @@ -1406,19 +1405,17 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) } /* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). + * This function is called after all vdevs associated with a given lwb write + * have completed their flush command; or as soon as the lwb write completes, + * if "zil_nocacheflush" is set. Further, all "previous" lwbs will have + * completed before this function is called; i.e. this function is called for + * all previous lwbs before it's called for "this" lwb (enforced via the zio + * dependencies configured in zil_lwb_set_zio_dependency()). * - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". + * The intention is for this function to be called as soon as the contents of + * an lwb are considered "stable" on disk, and will survive any sudden loss of + * power. At this point, any threads waiting for the lwb to reach this state + * are signalled, and the "waiter" structures are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) @@ -1532,17 +1529,16 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) } /* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree.
The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. + * This is called when an lwb's write zio completes. The callback's purpose is + * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The + * tree will contain the vdevs involved in writing out this specific lwb's + * data, and in the case that cache flushes have been deferred, vdevs involved + * in writing the data for previous lwbs. The writes corresponding to all the + * vdevs in the lwb_vdev_tree will have completed by the time this is called, + * due to the zio dependencies configured in zil_lwb_set_zio_dependency(), + * which takes deferred flushes into account. The lwb will be "done" once + * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion + * callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) @@ -1601,19 +1597,18 @@ zil_lwb_write_done(zio_t *zio) } /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). + * If this lwb does not have any threads waiting for it to complete, we + * want to defer issuing the flush command to the vdevs written to by + * "this" lwb, and instead rely on the "next" lwb to handle the flush + * command for those vdevs. Thus, we merge the vdev tree of "this" lwb + * with the vdev tree of the "next" lwb in the list, and assume the + * "next" lwb will handle flushing the vdevs (or deferring the flush(es) + * again). * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. + * This is a useful performance optimization, especially for workloads + * with lots of async write activity and little sync write and/or fsync + * activity, as it has the potential to coalesce multiple flush + * commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); @@ -1663,16 +1658,16 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of - * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. + * how we can defer the flush commands for each lwb. * - * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous - * lwb will rely on this lwb to flush the vdevs written to by that - * previous lwb.
Thus, we need to ensure this lwb doesn't issue the - * flush until after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child relationship here. + * When the flush commands are deferred, the previous lwb will rely on + * this lwb to flush the vdevs written to by that previous lwb. Thus, + * we need to ensure this lwb doesn't issue the flush until after the + * previous lwb's write completes. We ensure this ordering by setting + * the zio parent/child relationship here. * - * Without this relationship on the lwb's write zio, it's possible - * for this lwb's write to complete prior to the previous lwb's write + * Without this relationship on the lwb's write zio, it's possible for + * this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, @@ -1965,7 +1960,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) &slog); } if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -3499,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. + * callback of the root zio for the flush commands that are sent to + * the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 213fe5c483f2..65a0afaaa21c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -23,7 +23,7 @@ * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ @@ -63,7 +63,7 @@ const char *const zio_type_name[ZIO_TYPES] = { * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" + "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; @@ -613,7 +613,7 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -803,9 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, /* * If we can tell the caller to execute this parent next, do - * so. We only do this if the parent's zio type matches the - * child's type. Otherwise dispatch the parent zio in its - * own taskq. + * so. We do this if the parent's zio type matches the child's + * type, or if it's a zio_null() with no done callback, and so + * has no actual work to do. Otherwise dispatch the parent zio + * in its own taskq. 
* * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch @@ -825,7 +826,8 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && - pio->io_type == zio->io_type) { + (pio->io_type == zio->io_type || + (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); @@ -1052,8 +1054,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], - (long long)bp->blk_phys_birth, - (long long)bp->blk_birth, + (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], @@ -1156,10 +1158,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, /* * Pool-specific checks. * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. + * Note: it would be nice to verify that the logical birth + * and physical birth are not too large. However, + * spa_freeze() allows the birth time of log blocks (and + * dmu_sync()-ed blocks that are in the log) to be arbitrarily + * large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; @@ -1246,7 +1249,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, { zio_t *zio; - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? @@ -1435,7 +1438,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ @@ -1448,17 +1451,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, zio_flag_t flags) -{ - zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - zio->io_cmd = cmd; - return (zio); -} - zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, @@ -1625,15 +1617,25 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, return (zio); } + +/* + * Send a flush command to the given vdev. Unlike most zio creation functions, + * the flush zios are issued immediately. You can wait on pio to pause until + * the flushes complete. 
+ */ void zio_flush(zio_t *pio, vdev_t *vd) { + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY; + if (vd->vdev_nowritecache) return; + if (vd->vdev_children == 0) { - zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, - DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, + NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, + NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); @@ -1731,7 +1733,7 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1819,7 +1821,7 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -1866,7 +1868,7 @@ zio_write_compress(zio_t *zio) BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; + BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); @@ -1947,7 +1949,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -1961,7 +1963,7 @@ zio_write_compress(zio_t *zio) } if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && + if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); @@ -2039,12 +2041,14 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) /* * If this is a high priority I/O, then use the high priority taskq if - * available. + * available or cut the line otherwise. 
*/ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { + if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + else + flags |= TQ_FRONT; + } ASSERT3U(q, <, ZIO_TASKQ_TYPES); @@ -2532,8 +2536,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); @@ -2921,7 +2927,6 @@ static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; - cio->io_wr_iss_tq = pio->io_wr_iss_tq; } static void @@ -3539,7 +3544,7 @@ zio_ddt_write(zio_t *zio) else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); @@ -3810,11 +3815,13 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + if (!BP_IS_HOLE(bp)) { + metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), + B_TRUE); + } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -4056,6 +4063,16 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + /* + * "no-op" injections return success, but do no actual + * work. Just skip the remaining vdev stages. + */ + zio_vdev_io_bypass(zio); + zio_interrupt(zio); + return (NULL); + } + if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -4083,14 +4100,17 @@ zio_vdev_io_done(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FLUSH || + zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { - vdev_queue_io_done(zio); + if (zio->io_type != ZIO_TYPE_FLUSH) + vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, @@ -4099,7 +4119,8 @@ zio_vdev_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); - if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { + if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && + zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { @@ -4234,8 +4255,7 @@ zio_vdev_io_assess(zio_t *zio) * boolean flag so that we don't bother with it in the future. 
*/ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) @@ -4555,8 +4575,8 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || + BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); @@ -4852,7 +4872,7 @@ zio_done(zio_t *zio) * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index e511b31fee6d..ce6772a40c8b 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -272,7 +272,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); + uint64_t txg = BP_GET_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 609182f4a2cd..012a0e3c6c17 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -364,10 +366,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) int ret = 0; /* - * We skip over faults in the labels unless it's during - * device open (i.e. zio == NULL). + * We skip over faults in the labels unless it's during device open + * (i.e. 
zio == NULL) or a device flush (offset is meaningless) */ - if (zio != NULL) { + if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || @@ -605,9 +607,11 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ if (handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } /* * Defensive; should never happen as the array allocation @@ -703,6 +707,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -760,6 +821,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? + handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -810,16 +893,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. 
*/ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. + */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -832,6 +941,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -891,7 +1005,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strlcpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -941,7 +1059,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 9c836786baea..876c198c64de 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -17,7 +17,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) # by generating a preamble text file which kmodtool can append to the spec file. %(/bin/echo -e "\ Requires: @PACKAGE@ = %{version}\n\ -Conflicts: @PACKAGE@-dkms) +Conflicts: @PACKAGE@-dkms" > %{_sourcedir}/kmod-preamble) # LDFLAGS are not sanitized by arch/*/Makefile for these architectures. 
%ifarch ppc ppc64 ppc64le aarch64 diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index 179e24d7a0ef..c25903ea1bee 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh} PROG=zfs-tests.sh VERBOSE="no" QUIET="" +DEBUG="" CLEANUP="yes" CLEANUPALL="no" KMSG="" @@ -313,6 +314,7 @@ OPTIONS: -h Show this message -v Verbose zfs-tests.sh output -q Quiet test-runner output + -D Debug; show all test output immediately (noisy) -x Remove all testpools, dm, lo, and files (unsafe) -k Disable cleanup after test failure -K Log test names to /dev/kmsg @@ -326,7 +328,8 @@ OPTIONS: -d DIR Use world-writable DIR for files and loopback devices -s SIZE Use vdevs of SIZE (default: 4G) -r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES}) - -t PATH Run single test at PATH relative to test suite + -t PATH|NAME Run single test at PATH relative to test suite, + or search for test by NAME -T TAGS Comma separated list of tags (default: 'functional') -u USER Run single test as USER (default: root) @@ -340,6 +343,9 @@ $0 -r linux-fast # Run a single test $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh +# Run a single test by name +$0 -t zfs_bookmark_cliargs + # Cleanup a previous run of the test suite prior to testing, run the # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit. $0 -x @@ -347,7 +353,7 @@ $0 -x EOF } -while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do +while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do case $OPTION in h) usage @@ -393,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do d) FILEDIR="$OPTARG" ;; + D) + DEBUG="yes" + ;; I) ITERATIONS="$OPTARG" if [ "$ITERATIONS" -le 0 ]; then @@ -450,8 +459,15 @@ post_user = root post = outputdir = /var/tmp/test_results EOF - SINGLETESTDIR="${SINGLETEST%/*}" + if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then + NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit) + if [ -z "$NEWSINGLETEST" ] ; then + fail "couldn't find test matching '$SINGLETEST'" + fi + SINGLETEST=$NEWSINGLETEST + fi + SINGLETESTDIR="${SINGLETEST%/*}" SETUPDIR="$SINGLETESTDIR" [ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR" [ -x "$SETUPDIR/setup.ksh" ] && SETUPSCRIPT="setup" || SETUPSCRIPT= @@ -680,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report) # msg "${TEST_RUNNER}" \ "${QUIET:+-q}" \ + "${DEBUG:+-D}" \ "${KMEMLEAK:+-m}" \ "${KMSG:+-K}" \ "-c \"${RUNFILES}\"" \ @@ -689,6 +706,7 @@ msg "${TEST_RUNNER}" \ { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ ${KMSG:+-K} \ -c "${RUNFILES}" \ @@ -715,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then { PATH=$STF_PATH \ ${TEST_RUNNER} \ ${QUIET:+-q} \ + ${DEBUG:+-D} \ ${KMEMLEAK:+-m} \ -c "${RUNFILES}" \ -T "${TAGS}" \ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 502b4de2bae9..5e7fdf359a75 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -153,6 +153,12 @@ tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] +[tests/functional/cli_root/zinject] +tests = ['zinject_args'] +pre = +post = +tags = ['functional', 'cli_root', 'zinject'] + [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', @@ -246,7 +252,7 @@ tests = 
['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_test_race'] + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] @@ -372,7 +378,8 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', + 'zpool_add--allow-ashift-mismatch'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] @@ -459,7 +466,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced', - 'zpool_import_status'] + 'zpool_import_status', 'zpool_import_parallel_pos', + 'zpool_import_parallel_neg', 'zpool_import_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 @@ -635,6 +643,10 @@ tags = ['functional', 'compression'] tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] +[tests/functional/zap_shrink] +tests = ['zap_shrink_001_pos'] +tags = ['functional', 'zap_shrink'] + [tests/functional/crtime] tests = ['crtime_001_pos' ] tags = ['functional', 'crtime'] @@ -964,6 +976,12 @@ tests = [ 'userspace_send_encrypted', 'userspace_encrypted_13709'] tags = ['functional', 'userquota'] +[tests/functional/vdev_disk:Linux] +pre = +post = +tests = ['page_alignment'] +tags = ['functional', 'vdev_disk'] + [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a0b74ef4a8c6..92ce09ec6fcb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -146,7 +146,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index ab41c05b8473..d6a791e3375d 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] + 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] @@ -599,6 +600,12 @@ tags = ['functional', 'truncate'] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] 
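The zinject and vdev_disk groups added to common.run above (and mirrored for sanity.run just below) share one convention worth noting: assigning empty pre = and post = values overrides the runner's inherited per-directory setup and cleanup scripts, which suits standalone tests such as the compiled page_alignment binary that need no pool scaffolding. A hypothetical group following the same pattern:

	# Hypothetical runfile group: the empty pre/post assignments
	# skip the inherited setup.ksh/cleanup.ksh for this directory.
	[tests/functional/example]
	pre =
	post =
	tests = ['example_001_pos']
	tags = ['functional', 'example']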
+[tests/functional/vdev_disk:Linux] +pre = +post = +tests = ['page_alignment'] +tags = ['functional', 'vdev_disk'] + [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos'] diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 422ebd7bc8bf..65247f4f06fc 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -113,8 +113,9 @@ class Output(object): This class is a slightly modified version of the 'Stream' class found here: http://goo.gl/aSGfv """ - def __init__(self, stream): + def __init__(self, stream, debug=False): self.stream = stream + self.debug = debug self._buf = b'' self.lines = [] @@ -140,6 +141,8 @@ class Output(object): buf = os.read(fd, 4096) if not buf: return None + if self.debug: + os.write(sys.stderr.fileno(), buf) if b'\n' not in buf: self._buf += buf return [] @@ -238,14 +241,14 @@ User: %s ret = '%s -E -u %s %s' % (SUDO, user, cmd) return ret.split(' ') - def collect_output(self, proc): + def collect_output(self, proc, debug=False): """ Read from stdout/stderr as data becomes available, until the process is no longer running. Return the lines from the stdout and stderr Output objects. """ - out = Output(proc.stdout) - err = Output(proc.stderr) + out = Output(proc.stdout, debug) + err = Output(proc.stderr, debug) res = [] while proc.returncode is None: proc.poll() @@ -308,7 +311,10 @@ User: %s try: t.start() - self.result.stdout, self.result.stderr = self.collect_output(proc) + + out, err = self.collect_output(proc, options.debug) + self.result.stdout = out + self.result.stderr = err if kmemleak: cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"' @@ -624,7 +630,7 @@ Tags: %s class TestRun(object): - props = ['quiet', 'outputdir'] + props = ['quiet', 'outputdir', 'debug'] def __init__(self, options): self.tests = {} @@ -644,7 +650,8 @@ class TestRun(object): ('post_user', ''), ('failsafe', ''), ('failsafe_user', ''), - ('tags', []) + ('tags', []), + ('debug', False) ] def __str__(self): @@ -1067,6 +1074,8 @@ def parse_args(): help='Specify tests to run via config files.') parser.add_option('-d', action='store_true', default=False, dest='dryrun', help='Dry run. 
Print tests, but take no other action.') + parser.add_option('-D', action='store_true', default=False, dest='debug', + help='Write all test output to stdout as it arrives.') parser.add_option('-l', action='callback', callback=options_cb, default=None, dest='logfile', metavar='logfile', type='string', diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am index 3dd1a6452728..40a361d582a2 100644 --- a/tests/zfs-tests/Makefile.am +++ b/tests/zfs-tests/Makefile.am @@ -13,6 +13,9 @@ scripts_zfs_tests_functional_hkdf_PROGRAMS = %D%/tests/functional/hkdf/hkdf_test %C%_tests_functional_hkdf_hkdf_test_LDADD = \ libzpool.la +scripts_zfs_tests_functional_vdev_diskdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/vdev_disk +scripts_zfs_tests_functional_vdev_disk_PROGRAMS = %D%/tests/functional/vdev_disk/page_alignment + scripts_zfs_tests_functional_cp_filesdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/cp_files scripts_zfs_tests_functional_cp_files_PROGRAMS = %D%/tests/functional/cp_files/seekflood diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index fe9c92108725..d625c040b819 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -606,6 +606,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ + functional/cli_root/zinject/zinject_args.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ @@ -769,6 +770,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ + functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ @@ -988,6 +990,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ + functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ @@ -1141,6 +1144,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_status.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ + functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ @@ -1587,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ + functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ @@ -2068,6 +2075,9 
@@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ + functional/zap_shrink/cleanup.ksh \ + functional/zap_shrink/zap_shrink_001_pos.ksh \ + functional/zap_shrink/setup.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index 3b8eaea5bb54..84b92b4dcdc9 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -97,20 +97,19 @@ function verify_pool_prop_eq function verify_pool_props { - typeset -r dsize=$1 - typeset -r ratio=$2 + typeset -r oused=$1 + typeset -r osaved=$2 + typeset dsize=$3 + typeset ratio=$4 if [[ $dsize -eq 0 ]]; then - verify_pool_prop_eq bcloneused 0 - verify_pool_prop_eq bclonesaved 0 - verify_pool_prop_eq bcloneratio 1.00 - else - if [[ $ratio -eq 1 ]]; then - verify_pool_prop_eq bcloneused 0 - else - verify_pool_prop_eq bcloneused $dsize - fi - verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + ratio=1 + elif [[ $ratio -eq 1 ]]; then + dsize=0 + fi + verify_pool_prop_eq bcloneused $(($oused+$dsize)) + verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1))) + if [[ $oused -eq 0 ]]; then verify_pool_prop_eq bcloneratio "${ratio}.00" fi } @@ -124,16 +123,22 @@ function bclone_test typeset -r srcdir=$4 typeset -r dstdir=$5 typeset dsize + typeset oused + typeset osaved typeset -r original="${srcdir}/original" typeset -r clone="${dstdir}/clone" log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) + # Create a test file with known content. case $datatype in random|text) - sync_pool $TESTPOOL if [[ $datatype = "random" ]]; then dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null else @@ -146,13 +151,13 @@ function bclone_test sync_pool $TESTPOOL # It is hard to predict block sizes that will be used, # so just do one clone and take it from bcloneused. - filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + dsize=$(get_pool_prop bcloneused $TESTPOOL) + dsize=$(($dsize-$oused)) if [[ $embedded = "false" ]]; then - log_must test $filesize -gt 0 + log_must test $dsize -gt 0 fi rm -f "${clone}-tmp" sync_pool $TESTPOOL - dsize=$filesize ;; hole) log_must truncate_test -s $filesize -f $original @@ -217,7 +222,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}4" $filesize test_file_integrity $original_checksum "${clone}5" $filesize - verify_pool_props $dsize 7 + verify_pool_props $oused $osaved $dsize 7 # Clear cache and test after fresh import. 
log_must zpool export $TESTPOOL @@ -240,7 +245,7 @@ function bclone_test sync_pool $TESTPOOL - verify_pool_props $dsize 11 + verify_pool_props $oused $osaved $dsize 11 log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL @@ -268,7 +273,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}8" $filesize test_file_integrity $original_checksum "${clone}9" $filesize - verify_pool_props $dsize 6 + verify_pool_props $oused $osaved $dsize 6 rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" @@ -276,11 +281,11 @@ function bclone_test test_file_integrity $original_checksum "${clone}6" $filesize - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 rm -f "${clone}6" sync_pool $TESTPOOL - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 } diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib index ddfbfc999c4e..aeb8efe91715 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -66,7 +66,7 @@ function bclone_corner_cases_init export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) - export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest) export FIRST_HALF_CHECKSUM="" export SECOND_HALF_CHECKSUM="" } @@ -210,6 +210,8 @@ function bclone_corner_cases_test typeset -r dstdir=$2 typeset limit=$3 typeset -i count=0 + typeset oused + typeset osaved if [[ $srcdir != "count" ]]; then if [[ -n "$limit" ]]; then @@ -217,6 +219,11 @@ function bclone_corner_cases_test limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) fi bclone_corner_cases_init $srcdir $dstdir + + # Save current block cloning stats for later use. 
+ sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) fi # @@ -285,21 +292,24 @@ function bclone_corner_cases_test overwrite_clone "$second_overwrite" if checksum_compare $read_after; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" fi log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL if checksum_compare "yes"; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" fi rm -f "$CLONE" + sync_pool $TESTPOOL + verify_pool_prop_eq bcloneused $oused + verify_pool_prop_eq bclonesaved $osaved done done done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg index 06d25faf0356..739baf16086a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg @@ -31,6 +31,7 @@ export mountcmd=mount export mountforce="$mountcmd -f" export mountall="$mountcmd -a" +export mountrecursive="$mountcmd -R" export unmountcmd=unmount export unmountforce="$unmountcmd -f" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh new file mode 100755 index 000000000000..0e5cc5d6955e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh @@ -0,0 +1,146 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, iXsystems Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+
+#
+# DESCRIPTION:
+# Verify zfs mount -R functionality.
+#
+# STRATEGY:
+# 1. Create nested datasets
+# 2. Unmount all datasets
+# 3. Recursively mount the root dataset; this should mount all datasets
+#    present in the pool
+# 4. Unmount all datasets
+# 5. Recursively mount child datasets that have children. This should mount
+#    the child datasets, but not the root dataset or parent datasets
+# 6. Unmount all datasets
+# 7. Mount the root dataset recursively again and confirm all child
+#    datasets are mounted.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must datasetexists $TESTPOOL/$TESTFS1 && \
+	    destroy_dataset $TESTPOOL/$TESTFS1 -R
+	log_must datasetexists $TESTPOOL/$TESTFS2 && \
+	    destroy_dataset $TESTPOOL/$TESTFS2 -R
+	log_must datasetexists $TESTPOOL/$TESTFS3 && \
+	    destroy_dataset $TESTPOOL/$TESTFS3 -R
+}
+
+function setup_all
+{
+	log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS
+	log_must zfs create $TESTPOOL/$TESTFS1
+	log_must zfs create $TESTPOOL/$TESTFS2
+	log_must zfs create $TESTPOOL/$TESTFS3
+	log_must zfs create $TESTPOOL/$TESTFS2/child1
+	log_must zfs create $TESTPOOL/$TESTFS2/child2
+	log_must zfs create $TESTPOOL/$TESTFS2/child3
+	log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild
+	log_must zfs create $TESTPOOL/$TESTFS3/child
+}
+
+log_assert "Verify that 'zfs $mountrecursive' successfully " \
+    "mounts the dataset along with all its children."
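# For illustration only (a sketch, not part of the patch): given the layout
# created by setup_all above, the behaviour under test is
#
#   zfs mount -R $TESTPOOL            # mount the pool root and every child
#   zfs mount -R $TESTPOOL/$TESTFS2   # mount only that subtree
#
# with $mountrecursive defined as "$mountcmd -R" in zfs_mount.cfg.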
+ +log_onexit cleanup + +log_must setup_all + +log_must zfs $unmountall + +log_must zfs $mountrecursive $TESTPOOL + +log_must mounted $TESTPOOL +log_must mounted $TESTPOOL/$TESTFS +log_must mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3 + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2 + +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_pass "'zfs $mountrecursive' behaves as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh new file mode 100755 index 000000000000..dd9ef9ddd229 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+#
+# TODO: this only checks the set of valid device fault types. It should
+# check all the other options, that they work, and everything really.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Check zinject parameters."
+
+log_onexit cleanup
+
+DISK1=${DISKS%% *}
+
+function cleanup
+{
+	zinject -c all
+	default_cleanup_noexit
+}
+
+function test_device_fault
+{
+	typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop")
+	for e in ${errno[@]}; do
+		log_must eval \
+		    "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"
+	done
+	zinject -c all
+}
+
+default_mirror_setup_noexit $DISKS
+
+test_device_fault
+
+log_pass "zinject parameters work as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
index 7ecaf849e44b..51871934dd22 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
@@ -22,7 +22,7 @@
 #
 # Copyright 2017, loli10K. All rights reserved.
-# Copyright (c) 2020 by Delphix. All rights reserved.
+# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2
 logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT)
 orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
 max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT)
+opt=""
 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
 do
+	#
+	# Need to add the --allow-ashift-mismatch option to disable the
+	# ashift mismatch checks in zpool add.
+	#
+	if [[ $ashift -eq $orig_ashift ]]; then
+		opt=""
+	else
+		opt="--allow-ashift-mismatch"
+	fi
+
 	log_must zpool create $TESTPOOL $disk1
-	log_must zpool add -o ashift=$ashift $TESTPOOL $disk2
+	log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2
 	log_must verify_ashift $disk2 $ashift
 
 	# clean things for the next run
@@ -78,7 +89,7 @@ do
 	#
 	log_must zpool create $TESTPOOL $disk1
 	log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift
-	log_must zpool add $TESTPOOL $disk2
+	log_must zpool add $opt $TESTPOOL $disk2
 	exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift ))
 	log_must verify_ashift $disk2 $exp
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
index 228f62232aae..6a3283d0618f 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
@@ -22,7 +22,7 @@
 #
 # Copyright 2017, loli10K. All rights reserved.
-# Copyright (c) 2020 by Delphix. All rights reserved.
+# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
 #
 
 .
$STF_SUITE/include/libtest.shlib @@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + if [ $ashift -eq $orig_ashift ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -82,8 +87,13 @@ for ashift in ${ashifts[@]} do for cmdval in ${ashifts[@]} do + if [ $ashift -eq $cmdval ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2 log_must verify_ashift $disk2 $cmdval # clean things for the next run diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh index c5c06f76340b..afee34a33469 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh @@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2 log_must zpool add -f $TESTPOOL $DISK2 log_must vdevs_in_pool $TESTPOOL $DISK2 +log_must zpool destroy $TESTPOOL + +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must poolexists $TESTPOOL + +log_mustnot zpool add $TESTPOOL $DISK2 +log_mustnot vdevs_in_pool $TESTPOOL $DISK2 + +log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2 +log_must vdevs_in_pool $TESTPOOL $DISK2 + log_pass "'zpool add -f ...' executes successfully." 
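The zpool_add tests above exercise the new targeted overrides that this patch introduces alongside the old catch-all `zpool add -f`. A minimal usage sketch (the pool name "tank" and disk names are placeholders, not from the patch):

    # add a device whose ashift does not match the pool's
    zpool add --allow-ashift-mismatch tank sdb

    # add a plain disk to a pool built from mirrors
    zpool add --allow-replication-mismatch tank sdc

Note that --allow-in-use is not a blanket force: the log_mustnot cases in zpool_add_005_pos.ksh and zpool_add_009_neg.ksh below verify that it still refuses devices that are actively in use.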
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh index 646edc1a4557..cecda56ab125 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh @@ -70,7 +70,7 @@ if is_freebsd; then recursive=$(get_tunable VOL_RECURSIVE) log_must set_tunable64 VOL_RECURSIVE 1 fi -log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL +log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh index 4990ef9d29b0..0e9d9f5f030f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh @@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1 unset NOINUSE_CHECK log_mustnot zpool add -f $TESTPOOL $DISK1 +log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1 log_mustnot zpool add -f $TESTPOOL $mnttab_dev +log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev if is_linux; then log_mustnot zpool add $TESTPOOL $vfstab_dev else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index d7f3a900e8fd..a13a27160e76 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0 for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" do log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh index b8b25db1b9f9..22860e9caf1d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh @@ -138,7 +138,7 @@ function zpool_create_forced_add while ((j < ${#add_args[@]})); do log_must zpool create $TESTPOOL1 ${create_args[$i]} log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool add -f $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} log_must zpool destroy -f $TESTPOOL1 ((j += 1)) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh new file mode 100755 index 000000000000..c681d1b7dd23 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh @@ -0,0 +1,165 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# Verify that admin commands to a different pool are not blocked by import
+#
+# STRATEGY:
+# 1. Create 2 pools
+# 2. Export one of the pools
+# 3. Import the pool with an injected delay
+# 4. Execute some admin commands against both pools
+# 5. Verify that the admin commands to the non-imported pool don't stall
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	zinject -c all
+	destroy_pool $TESTPOOL1
+	destroy_pool $TESTPOOL2
+}
+
+function pool_import
+{
+	typeset dir=$1
+	typeset pool=$2
+
+	SECONDS=0
+	errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: imported in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-import
+	else
+		echo ${pool}: import failed $errmsg in $SECONDS secs
+	fi
+}
+
+function pool_add_device
+{
+	typeset pool=$1
+	typeset device=$2
+	typeset devtype=$3
+
+	SECONDS=0
+	errmsg=$(zpool add $pool $devtype $device 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: added $devtype vdev in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-add
+	else
+		echo ${pool}: add $devtype vdev failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+function pool_stats
+{
+	typeset stats=$1
+	typeset pool=$2
+
+	SECONDS=0
+	errmsg=$(zpool $stats $pool 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: $stats in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats}
+	else
+		echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+function pool_create
+{
+	typeset pool=$1
+	typeset device=$2
+
+	SECONDS=0
+	errmsg=$(zpool create $pool $device 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: created in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-create
+	else
+		echo ${pool}: create failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+log_assert "Simple admin commands to a different pool not blocked by import"
+
+log_onexit cleanup
+
+#
+# create two pools and export one
+#
+log_must zpool create $TESTPOOL1 $VDEV0
+log_must zpool export $TESTPOOL1
+log_must zpool create $TESTPOOL2 $VDEV1
+
+#
+# import the pool asynchronously with an injected 10 second delay
+#
+log_must zinject -P import -s 10 $TESTPOOL1
+pool_import $DEVICE_DIR $TESTPOOL1 &
+
+sleep 2
+
+#
+# run some admin commands on the pools while the import is in progress
+#
+
+pool_add_device $TESTPOOL1 $VDEV2 "log" &
+pool_add_device $TESTPOOL2 $VDEV3 "cache" &
+pool_stats "status" $TESTPOOL1 &
+pool_stats "status" $TESTPOOL2 &
+pool_stats "list" $TESTPOOL1 &
+pool_stats "list" $TESTPOOL2 &
+pool_create $TESTPOOL1 $VDEV4 &
+wait
+
+log_must zpool sync $TESTPOOL1 $TESTPOOL2
+
+zpool history $TESTPOOL1
+zpool history $TESTPOOL2
+
+log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import)
+
+#
+# verify that commands to the second pool did not wait for the import to finish
+#
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status)
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list)
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add)
+[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create"
+
+log_pass "Simple admin commands to a different pool not blocked by import"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh
new file mode 100755
index 000000000000..339dc2575ede
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh
@@ -0,0 +1,130 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# Verify that pool imports by the same name only have one winner
+#
+# STRATEGY:
+# 1. Create 4 single-disk pools with the same name
+# 2. Generate some ZIL records (for a longer import)
+# 3. Export the pools
+# 4. Import the pools in parallel
+# 5. Repeat using matching guids
+#
+
+verify_runnable "global"
+
+POOLNAME="import_pool"
+DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME"
+VDEVSIZE=$((512 * 1024 * 1024))
+
+log_assert "parallel pool imports by the same name only have one winner"
+
+# each pool has its own device directory
+for i in {0..3}; do
+	log_must mkdir -p ${DEV_DIR_PREFIX}$i
+	log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
+done
+
+function cleanup
+{
+	zinject -c all
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+
+	destroy_pool $POOLNAME
+
+	log_must rm -rf $DEV_DIR_PREFIX*
+}
+
+log_onexit cleanup
+
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+
+function import_pool
+{
+	typeset dir=$1
+	typeset pool=$2
+	typeset newname=$3
+
+	SECONDS=0
+	errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		touch $dir/imported
+		echo "imported $pool in $SECONDS secs"
+	elif [[ $errmsg == *"cannot import"* ]]; then
+		echo "pool import failed: $errmsg, waited $SECONDS secs"
+		touch $dir/failed
+	fi
+}
+
+#
+# create four exported pools with the same name
+#
+for i in {0..3}; do
+	log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
+	log_must zpool export $POOLNAME
+done
+log_must zinject -P import -s 10 $POOLNAME
+
+#
+# import the pools in parallel, expecting only one winner
+#
+for i in {0..3}; do
+	import_pool ${DEV_DIR_PREFIX}$i $POOLNAME &
+done
+wait
+
+# check the result of background imports
+typeset num_imports=0
+typeset num_cannot=0
+for i in {0..3}; do
+	if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then
+		((num_imports += 1))
+	fi
+	if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then
+		((num_cannot += 1))
+		loser=$i
+	fi
+done
+[[ $num_imports -eq "1" ]] || log_fail "expecting an import"
+[[ $num_cannot -eq "3" ]] || \
+	log_fail "expecting 3 pool exists errors, found $num_cannot"
+
+log_note "$num_imports imported and $num_cannot failed (expected)"
+
+log_pass "parallel pool imports by the same name only have one winner"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
new file mode 100755
index 000000000000..71b2437a37ec
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
@@ -0,0 +1,137 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+# test uses 8 vdevs
+export MAX_NUM=8
+
+#
+# DESCRIPTION:
+# Verify that pool imports can occur in parallel
+#
+# STRATEGY:
+# 1. Create 8 pools
+# 2. Generate some ZIL records
+# 3. Export the pools
+# 4. Import half of the pools synchronously to baseline sequential cost
+# 5. Import the other half asynchronously to demonstrate parallel savings
+# 6. Export 4 pools
+# 7. Test zpool import -a
+#
+
+verify_runnable "global"
+
+#
+# override the minimum vdev sizes
+#
+VDEVSIZE=$((512 * 1024 * 1024))
+increase_device_sizes $VDEVSIZE
+
+POOLNAME="import_pool"
+
+function cleanup
+{
+	zinject -c all
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+
+	for i in {0..$(($MAX_NUM - 1))}; do
+		destroy_pool $POOLNAME-$i
+	done
+	# reset the devices
+	increase_device_sizes 0
+	increase_device_sizes $FILE_SIZE
+}
+
+log_assert "Pool imports can occur in parallel"
+
+log_onexit cleanup
+
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+
+
+#
+# create some exported pools with import delay injectors
+#
+for i in {0..$(($MAX_NUM - 1))}; do
+	log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i
+	log_must zpool export $POOLNAME-$i
+	log_must zinject -P import -s 12 $POOLNAME-$i
+done
+wait
+
+#
+# import half of the pools synchronously
+#
+SECONDS=0
+for i in {0..3}; do
+	log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i
+done
+sequential_time=$SECONDS
+log_note "sequentially imported 4 pools in $sequential_time seconds"
+
+#
+# import half of the pools in parallel
+#
+SECONDS=0
+for i in {4..7}; do
+	log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i &
+done
+wait
+parallel_time=$SECONDS
+log_note "asynchronously imported 4 pools in $parallel_time seconds"
+
+log_must test $parallel_time -lt $(($sequential_time / 3))
+
+#
+# export pools with import delay injectors
+#
+for i in {4..7}; do
+	log_must zpool export $POOLNAME-$i
+	log_must zinject -P import -s 12 $POOLNAME-$i
+done
+wait
+
+#
+# now test zpool import -a
+#
+SECONDS=0
+log_must zpool import -a -d $DEVICE_DIR -f
+parallel_time=$SECONDS
+log_note "asynchronously imported 4 pools in $parallel_time seconds"
+
+log_must test $parallel_time -lt $(($sequential_time / 3))
+
+log_pass "Pool imports occur in parallel"
diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
index 60817449ab03..4db968ffae05 100755
--- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
@@ -76,7 +76,7 @@ log_onexit cleanup
 SRC_FILE=src.data
 DST_FILE=dst.data
-SRC_SIZE=$(($RANDOM % 2048))
+SRC_SIZE=$((1024 + $RANDOM % 1024))
 # A smaller recordsize is used merely to speed up the test.
 RECORDSIZE=4096
@@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do
 	# Overwrite a random range of an existing file and immediately copy it.
 	sync_pool $TESTPOOL
 	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
-	    seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
+	    seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
 	if [[ "$mode" == "always" ]]; then
 		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
 		log_must ls -l $CP_TESTDIR
@@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do
 	# Overwrite a random range of an existing file and immediately copy it.
 	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
-	    seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
+	    seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
 	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
 	verify_copy $SRC_FILE $DST_FILE
 	log_must rm -f $SRC_FILE $DST_FILE
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
new file mode 100755
index 000000000000..8b118684aa7f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+# DESCRIPTION:
+# Verify that long VDEV probes do not cause MMP checks to suspend the pool.
+# Note: without the PR-15839 fix, this test will suspend the pool.
+#
+# A device that is returning unexpected errors will trigger a vdev_probe.
+# When the device additionally has slow response times, the probe can hold
+# the spa config lock as a writer for a long period of time such that the
+# mmp uberblock updates stall when trying to acquire the spa config lock.
+#
+# STRATEGY:
+# 1. Create a pool with multiple leaf vdevs
+# 2. Enable multihost and multihost_history
+# 3. Delay for MMP writes to occur
+# 4. Verify that a long VDEV probe didn't cause an MMP check to suspend
+#    the pool
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must zinject -c all
+
+	if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
+		log_must zpool clear $MMP_POOL
+		zpool get state $MMP_POOL $MMP_DIR/file.3
+		zpool events | grep ".fs.zfs." | grep -v "history_event"
+	fi
+
+	poolexists $MMP_POOL && destroy_pool $MMP_POOL
+	log_must rm -r $MMP_DIR
+	log_must mmp_clear_hostid
+}
+
+log_assert "A long VDEV probe doesn't cause an MMP check suspend"
+log_onexit cleanup
+
+MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
+
+# Create a multiple-drive pool
+log_must zpool events -c
+log_must mkdir -p $MMP_DIR
+log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
+log_must zpool create -f $MMP_POOL \
+	mirror $MMP_DIR/file.{0,1,2} \
+	mirror $MMP_DIR/file.{3,4,5}
+
+# Enable MMP
+log_must mmp_set_hostid $HOSTID1
+log_must zpool set multihost=on $MMP_POOL
+clear_mmp_history
+
+# Inject vdev write errors along with a delay
+log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
+
+log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
+sleep 10
+sync_pool $MMP_POOL
+
+# Confirm mmp writes to the non-slow disks have taken place
+for x in {0,1,2,4}; do
+	write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
+	[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
+done
+
+# Expect that the pool was not suspended
+log_must check_state $MMP_POOL "" "ONLINE"
+health=$(zpool list -H -o health $MMP_POOL)
+log_note "$MMP_POOL health is $health"
+[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
+
+log_pass "A long VDEV probe doesn't cause an MMP check suspend"
diff --git a/tests/zfs-tests/tests/functional/quota/quota.kshlib b/tests/zfs-tests/tests/functional/quota/quota.kshlib
index 5083415c8956..b4cfde020f3f 100644
--- a/tests/zfs-tests/tests/functional/quota/quota.kshlib
+++ b/tests/zfs-tests/tests/functional/quota/quota.kshlib
@@ -95,3 +95,10 @@ function exceed_quota
 	log_fail "Returned error code: $zret. Expected: $EDQUOT."
return 0 } + +function reset_quota +{ + typeset FILESYSTEM="$1" + + log_must zfs set quota=none $FILESYSTEM +} diff --git a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh index d124cb26ae98..f01008a46bb1 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh @@ -64,6 +64,8 @@ function cleanup # wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTFS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh index 3af005e874e9..bea2a5a68691 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh @@ -64,6 +64,8 @@ function cleanup wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTFS } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh index de265813d55b..33f6421131fc 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh @@ -67,6 +67,8 @@ function cleanup # wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTCTR/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh index 8f20b533da68..682d09f080a4 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh @@ -65,6 +65,8 @@ function cleanup wait_freeing $TESTPOOL sync_pool $TESTPOOL + + reset_quota $TESTPOOL/$TESTCTR/$TESTFS1 } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh index 98ee4edae650..9c4db81ca270 100755 --- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh @@ -50,20 +50,19 @@ function cleanup { datasetexists $fs_child && destroy_dataset $fs_child - log_must zfs set quota=$quota_val $fs + reset_quota $fs } log_onexit cleanup -log_assert "Verify that quota doesnot inherit its value from parent." -log_onexit cleanup +log_assert "Verify that quota does not inherit its value from parent." fs=$TESTPOOL/$TESTFS fs_child=$TESTPOOL/$TESTFS/$TESTFS space_avail=$(get_prop available $fs) quota_val=$(get_prop quota $fs) -typeset -i quotasize=$space_avail +typeset -li quotasize=$space_avail ((quotasize = quotasize * 2 )) log_must zfs set quota=$quotasize $fs @@ -72,4 +71,4 @@ quota_space=$(get_prop quota $fs_child) [[ $quota_space == $quotasize ]] && \ log_fail "The quota of child dataset inherits its value from parent." -log_pass "quota doesnot inherit its value from parent as expected." +log_pass "quota does not inherit its value from parent as expected." 
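The reset_quota helper introduced in quota.kshlib above is just a logged wrapper around clearing the property, so each of the quota test cleanups now reduces to a sketch like:

    function cleanup
    {
        # return the filesystem to its default, unlimited quota
        reset_quota $TESTPOOL/$TESTFS
    }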
diff --git a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
index 12105162c5b5..111d771188e3 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
@@ -50,7 +50,7 @@ log_assert "Verify cannot set quota lower than the space currently in use"
 function cleanup
 {
-	log_must zfs set quota=none $TESTPOOL/$TESTFS
+	reset_quota $TESTPOOL/$TESTFS
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
index 37ef84b72377..e6ad25f23f93 100755
--- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
+++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
@@ -44,8 +44,6 @@ user_ns_cleanup() {
 	log_must zfs destroy -r "$TESTPOOL/userns"
 }
 
-log_onexit user_ns_cleanup
-
 log_assert "Check zfs zone command handling of non-namespace files"
 
 # Pass if user namespaces are not supported.
@@ -54,6 +52,8 @@
 if [ "$?" -ne "0" ]; then
 	log_unsupported "Failed to create user namespace"
 fi
 
+log_onexit user_ns_cleanup
+
 # Create the baseline datasets.
 log_must zfs create -o zoned=on "$TESTPOOL/userns"
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/.gitignore b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
new file mode 100644
index 000000000000..27653e5924fc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
@@ -0,0 +1 @@
+page_alignment
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
new file mode 100644
index 000000000000..5c6d28eb2c44
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/param.h>
+
+/*
+ * This tests the vdev_disk page alignment check callback
+ * vdev_disk_check_pages_cb(). For now, this test includes a copy of that
+ * function from module/os/linux/zfs/vdev_disk.c. If you change it here,
+ * remember to change it there too, and add test data here to validate the
+ * change you're making.
+ */
+
+struct page;
+
+typedef struct {
+	uint32_t bmask;
+	uint32_t npages;
+	uint32_t end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	(void) page;
+	vdev_disk_check_pages_t *s = priv;
+
+	/*
+	 * If we didn't finish on a block size boundary last time, then there
+	 * would be a gap if we tried to use this ABD as-is, so abort.
+ */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = (off+len) & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +typedef struct { + /* test name */ + const char *name; + + /* blocks size mask */ + uint32_t mask; + + /* amount of data to take */ + size_t size; + + /* [start offset in page, len to end of page or size] */ + size_t pages[16][2]; +} page_test_t; + +static const page_test_t valid_tests[] = { + /* 512B block tests */ + { + "512B blocks, 4K single page", + 0x1ff, 0x1000, { + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 1K at start of page", + 0x1ff, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 1K at end of page", + 0x1ff, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "512B blocks, 1K within page, 512B start offset", + 0x1ff, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, { + "512B blocks, 8K across 2x4K pages", + 0x1ff, 0x2000, { + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 4K across two pages, 2K start offset", + 0x1ff, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "512B blocks, 16K across 5x4K pages, 512B start offset", + 0x1ff, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0200 }, + }, + }, { + "512B blocks, 64K data, 8x8K compound pages", + 0x1ff, 0x10000, { + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + }, + }, { + "512B blocks, 64K data, 9x8K compound pages, 512B start offset", + 0x1ff, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "512B blocks, 64K data, 2x16K compound pages, 8x4K pages", + 0x1ff, 0x10000, { + { 0x0, 0x8000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "512B blocks, 64K data, mixed 4K/8K/16K pages", + 0x1ff, 0x10000, { + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + }, + }, { + "512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 0x1ff, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + /* 4K block tests */ + { + "4K blocks, 4K single page", + 0xfff, 0x1000, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K at start of page", + 0xfff, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K at end of page", + 0xfff, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "4K blocks, 1K within page, 512B start offset", + 0xfff, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, { + "4K blocks, 8K across 2x4K pages", + 0xfff, 0x2000, { + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 4K across two pages, 2K start offset", + 0xfff, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "4K blocks, 16K across 5x4K pages, 512B start offset", + 0xfff, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 
}, + { 0x0, 0x1000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, 8x8K compound pages", + 0xfff, 0x10000, { + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + }, + }, { + "4K blocks, 64K data, 9x8K compound pages, 512B start offset", + 0xfff, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, 2x16K compound pages, 8x4K pages", + 0xfff, 0x10000, { + { 0x0, 0x8000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages", + 0xfff, 0x10000, { + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 0xfff, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + { 0 }, +}; + +static const page_test_t invalid_tests[] = { + { + "512B blocks, 16K data, 512 leader (gang block simulation)", + 0x1ff, 0x8000, { + { 0x0, 0x0200 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0c00 }, + }, + }, { + "4K blocks, 32K data, 2 incompatible spans " + "(gang abd simulation)", + 0xfff, 0x8000, { + { 0x0800, 0x0800 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0800 }, + { 0x0800, 0x0800 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0800 }, + }, + }, + { 0 }, +}; + +static bool +run_test(const page_test_t *test, bool verbose) +{ + size_t rem = test->size; + + vdev_disk_check_pages_t s = { + .bmask = 0xfff, + .npages = 0, + .end = 0, + }; + + for (int i = 0; test->pages[i][1] > 0; i++) { + size_t off = test->pages[i][0]; + size_t len = test->pages[i][1]; + + size_t take = MIN(rem, len); + + if (verbose) + printf(" page %d [off %lx len %lx], " + "rem %lx, take %lx\n", + i, off, len, rem, take); + + if (vdev_disk_check_pages_cb(NULL, off, take, &s)) { + if (verbose) + printf(" ABORT: misalignment detected, " + "rem %lx\n", rem); + return (false); + } + + rem -= take; + if (rem == 0) + break; + } + + if (rem > 0) { + if (verbose) + printf(" ABORT: ran out of pages, rem %lx\n", rem); + return (false); + } + + return (true); +} + +static void +run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed) +{ + for (const page_test_t *test = &tests[0]; test->name; test++) { + bool pass = (run_test(test, false) == want); + if (pass) { + printf("%s: PASS\n", test->name); + (*npassed)++; + } else { + printf("%s: FAIL [expected %s, got %s]\n", test->name, + want ? "VALID" : "INVALID", + want ? "INVALID" : "VALID"); + run_test(test, true); + } + (*ntests)++; + } +} + +int main(void) { + int ntests = 0, npassed = 0; + + run_test_set(valid_tests, true, &ntests, &npassed); + run_test_set(invalid_tests, false, &ntests, &npassed); + + printf("\n%d/%d tests passed\n", npassed, ntests); + + return (ntests == npassed ? 
0 : 1); +} diff --git a/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh new file mode 100755 index 000000000000..42fe70042d6a --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh new file mode 100755 index 000000000000..b756d4e76c83 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh new file mode 100755 index 000000000000..4dbf579b8ac7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh @@ -0,0 +1,81 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create a large number of files in a directory. Then remove all files and
+# check that the directory zap was shrunk. Use zdb to check that the zap
+# object contains only one leaf block.
+#
+
+verify_runnable "global"
+
+DIR=largedir
+
+NR_FILES=100000
+BATCH=1000
+CWD=$PWD
+
+log_assert "Create a large number of files ($NR_FILES) in a directory. " \
+	"Make sure that the directory ZAP object was shrunk."
+
+log_must mkdir $TESTDIR/$DIR
+
+cd $TESTDIR/$DIR
+# To avoid overflowing the argument list, create the NR_FILES files in
+# batches of BATCH.
+for i in $(seq $(($NR_FILES/$BATCH))); do
+	touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)));
+done
+cd $CWD
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# remove all files in the $DIR directory
+cd $TESTDIR/$DIR
+for i in $(seq $(($NR_FILES/$BATCH))); do
+	rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)))
+done
+cd $CWD
+sync_pool $TESTPOOL
+
+log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# check that the shrunk zap persists across an export/import cycle
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_pass
diff --git a/udev/zvol_id.c b/udev/zvol_id.c
index 5960b978787a..609349594767 100644
--- a/udev/zvol_id.c
+++ b/udev/zvol_id.c
@@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
 int
 main(int argc, const char *const *argv)
 {
-	if (argc != 2) {
+	if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
 		fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
 		return (1);
 	}
@@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
 		return (1);
 	}
-	unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
-	if (dev_part != 0)
-		sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
+	const char *dev_part = strrchr(dev_name, 'p');
+	if (dev_part != NULL) {
+		sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
+	}
 	for (size_t i = 0; i < strlen(zvol_name); ++i)
 		if (isblank(zvol_name[i]))