From d8695e99955f1d9b6407bff2ef5611e9c97ddf3d Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 6 Jul 2015 05:20:31 +0200 Subject: [PATCH] Illumos 5746 more checksumming in zfs send Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Bayard Bell Approved by: Albert Lee References: https://github.com/illumos/illumos-gate/commit/98110f08fa182032082d98be2ddb9391fcd62bf1 https://www.illumos.org/issues/5746 https://github.com/zfsonlinux/zfs/issues/905 (Add "zstreamdump" integrity check program #905) https://github.com/zfsonlinux/zfs/commit/b79fc3fea9e9f26e0e65665243d569d8907ac279 (Add zstreamdump(8) command to examine ZFS send streams.) Porting notes: Some of those changes were somewhat hard to track due to: https://github.com/zfsonlinux/zfs/commit/2024041b6c5134a925a33c10eff24a47ecb541a6 Remove superfluous statement https://github.com/zfsonlinux/zfs/commit/044baf009aac4935eca0f96477eb3c43e95d758a Use taskq for dump_bytes() https://github.com/zfsonlinux/zfs/commit/88904bb3e3f4a385108343aee1ac7ee0d83e25dc Illumos 5162 - zfs recv should use loaned arc buffer to avoid copy kmem_alloc calls were changed to vmem_alloc in accordance with https://github.com/zfsonlinux/zfs/commit/77aef6f60ea29f6d3769addc778db6328ac85755 (Use vmem_alloc() for nvlists) and the kmem-rework account for ISO C90 warnings (-Werror=declaration-after-statement) account for error: format '%llx' expects argument of type 'long long unsigned int', but argument X has type 'uint64_t' [-Werror=format=] arc_buf_t *abuf; dmu_buf_t *bonus; zio_cksum_t cksum_orig; zio_cksum_t *cksump; Fix whitespace change (void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n", (unsigned)size); to (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", size); and to account for long unsigned int & match upstream Ported-by: kernelOfTruth kerneloftruth@gmail.com --- cmd/zstreamdump/zstreamdump.c | 60 +++- include/sys/spa.h | 13 + include/sys/zfs_ioctl.h | 18 +- include/sys/zio_checksum.h | 7 +- lib/libzfs/libzfs_sendrecv.c | 143 +++++----- module/zfs/dmu_send.c | 509 +++++++++++++++++++--------------- 6 files changed, 444 insertions(+), 306 deletions(-) diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index 176dd66b268a..52db18333f0b 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -27,7 +27,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #include @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -73,8 +74,8 @@ safe_malloc(size_t size) { void *rv = malloc(size); if (rv == NULL) { - (void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n", - (unsigned)size); + (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", + size); abort(); } return (rv); @@ -85,7 +86,6 @@ safe_malloc(size_t size) * * Read while computing incremental checksum */ - static size_t ssread(void *buf, size_t len, zio_cksum_t *cksum) { @@ -94,7 +94,7 @@ ssread(void *buf, size_t len, zio_cksum_t *cksum) if ((outlen = fread(buf, len, 1, send_stream)) == 0) return (0); - if (do_cksum && cksum) { + if (do_cksum) { if (do_byteswap) fletcher_4_incremental_byteswap(buf, len, cksum); else @@ -104,6 +104,34 @@ ssread(void *buf, size_t len, zio_cksum_t *cksum) return (outlen); } +static size_t +read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum); + if (r == 0) + return (0); + zio_cksum_t saved_cksum = *cksum; + r = ssread(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), cksum); + if (r == 0) + return (0); + if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && + !ZIO_CHECKSUM_EQUAL(saved_cksum, + drr->drr_u.drr_checksum.drr_checksum)) { + fprintf(stderr, "invalid checksum\n"); + (void) printf("Incorrect checksum in record header.\n"); + (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n", + (long long unsigned int)saved_cksum.zc_word[0], + (long long unsigned int)saved_cksum.zc_word[1], + (long long unsigned int)saved_cksum.zc_word[2], + (long long unsigned int)saved_cksum.zc_word[3]); + exit(1); + } + return (sizeof (*drr)); +} + /* * Print part of a block in ASCII characters */ @@ -185,8 +213,10 @@ main(int argc, char *argv[]) struct drr_free *drrf = &thedrr.drr_u.drr_free; struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; + struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; char c; boolean_t verbose = B_FALSE; + boolean_t very_verbose = B_FALSE; boolean_t first = B_TRUE; /* * dump flag controls whether the contents of any modified data blocks @@ -204,11 +234,14 @@ main(int argc, char *argv[]) do_cksum = B_FALSE; break; case 'v': + if (verbose) + very_verbose = B_TRUE; verbose = B_TRUE; break; case 'd': dump = B_TRUE; verbose = B_TRUE; + very_verbose = B_TRUE; break; case ':': (void) fprintf(stderr, @@ -231,7 +264,7 @@ main(int argc, char *argv[]) } send_stream = stdin; - while (ssread(drr, sizeof (dmu_replay_record_t), &zc)) { + while (read_hdr(drr, &zc)) { /* * If this is the first DMU record being processed, check for @@ -437,7 +470,7 @@ main(int argc, char *argv[]) if (verbose) { (void) printf("WRITE object = %llu type = %u " "checksum type = %u\n" - "offset = %llu length = %llu " + " offset = %llu length = %llu " "props = %llx\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, @@ -481,9 +514,9 @@ main(int argc, char *argv[]) if (verbose) { (void) printf("WRITE_BYREF object = %llu " "checksum type = %u props = %llx\n" - "offset = %llu length = %llu\n" + " offset = %llu length = %llu\n" "toguid = %llx refguid = %llx\n" - "refobject = %llu refoffset = %llu\n", + " refobject = %llu refoffset = %llu\n", (u_longlong_t)drrwbr->drr_object, drrwbr->drr_checksumtype, (u_longlong_t)drrwbr->drr_key.ddk_prop, @@ -544,7 +577,7 @@ main(int argc, char *argv[]) if (verbose) { (void) printf("WRITE_EMBEDDED object = %llu " "offset = %llu length = %llu\n" - "toguid = %llx comp = %u etype = %u " + " toguid = %llx comp = %u etype = %u " "lsize = %u psize = %u\n", (u_longlong_t)drrwe->drr_object, (u_longlong_t)drrwe->drr_offset, @@ -562,6 +595,13 @@ main(int argc, char *argv[]) /* should never be reached */ exit(1); } + if (drr->drr_type != DRR_BEGIN && very_verbose) { + (void) printf(" checksum = %llx/%llx/%llx/%llx\n", + (longlong_t)drrc->drr_checksum.zc_word[0], + (longlong_t)drrc->drr_checksum.zc_word[1], + (longlong_t)drrc->drr_checksum.zc_word[2], + (longlong_t)drrc->drr_checksum.zc_word[3]); + } pcksum = zc; } free(buf); diff --git a/include/sys/spa.h b/include/sys/spa.h index 5dc9084dad6b..c80e8337ea3a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -446,6 +446,19 @@ _NOTE(CONSTCOND) } while (0) ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) +#define ZIO_CHECKSUM_IS_ZERO(zc) \ + (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ + (zc)->zc_word[2] | (zc)->zc_word[3])) + +#define ZIO_CHECKSUM_BSWAP(zcp) \ +{ \ + (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ + (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ + (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ + (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ +} + + #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 09a96c043bf0..601a9a70c580 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_IOCTL_H @@ -237,6 +237,22 @@ typedef struct dmu_replay_record { uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; + + /* + * Nore: drr_checksum is overlaid with all record types + * except DRR_BEGIN. Therefore its (non-pad) members + * must not overlap with members from the other structs. + * We accomplish this by putting its members at the very + * end of the struct. + */ + struct drr_checksum { + uint64_t drr_pad[34]; + /* + * fletcher-4 checksum of everything preceding the + * checksum. + */ + zio_cksum_t drr_checksum; + } drr_checksum; } drr_u; } dmu_replay_record_t; diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index de89bc9a7967..56b83b559377 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_CHECKSUM_H @@ -34,13 +35,13 @@ extern "C" { /* * Signature for checksum functions. */ -typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); +typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *); /* * Information about each checksum function. */ typedef const struct zio_checksum_info { - zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ int ci_correctable; /* number of correctable bits */ int ci_eck; /* uses zio embedded checksum? */ int ci_dedup; /* strong enough for dedup? */ @@ -61,7 +62,7 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t zio_checksum_SHA256; +extern zio_checksum_func_t zio_checksum_SHA256; extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index bd2fd294706d..b35428f907cd 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -187,10 +187,28 @@ ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, } static int -cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd) +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) { - fletcher_4_incremental_native(buf, len, zc); - return (write(outfd, buf, len)); + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. + drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); } /* @@ -217,26 +235,18 @@ cksummer(void *arg) char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; - struct drr_begin *drrb = &thedrr.drr_u.drr_begin; - struct drr_end *drre = &thedrr.drr_u.drr_end; - struct drr_object *drro = &thedrr.drr_u.drr_object; - struct drr_write *drrw = &thedrr.drr_u.drr_write; - struct drr_spill *drrs = &thedrr.drr_u.drr_spill; - struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; FILE *ofp; int outfd; - dmu_replay_record_t wbr_drr = {0}; - struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; dedup_table_t ddt; zio_cksum_t stream_cksum; uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t numbuckets; ddt.max_ddt_size = - MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100, - SMALLEST_POSSIBLE_MAX_DDT_MB<<20); + MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, + SMALLEST_POSSIBLE_MAX_DDT_MB << 20); - numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t)); + numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); /* * numbuckets must be a power of 2. Increase number to @@ -252,32 +262,29 @@ cksummer(void *arg) ddt.numhashbits = high_order_bit(numbuckets) - 1; ddt.ddt_full = B_FALSE; - /* Initialize the write-by-reference block. */ - wbr_drr.drr_type = DRR_WRITE_BYREF; - wbr_drr.drr_payloadlen = 0; - outfd = dda->outputfd; ofp = fdopen(dda->inputfd, "r"); - while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) { + while (ssread(drr, sizeof (*drr), ofp) != 0) { switch (drr->drr_type) { case DRR_BEGIN: { - int fflags; + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int fflags; + int sz = 0; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + /* set the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { - int sz = drr->drr_payloadlen; + sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { buf = zfs_realloc(dda->dedup_hdl, buf, @@ -286,64 +293,60 @@ cksummer(void *arg) (void) ssread(buf, sz, ofp); if (ferror(stdin)) perror("fread"); - if (cksum_and_write(buf, sz, &stream_cksum, - outfd) == -1) - goto out; } + if (dump_record(drr, buf, sz, &stream_cksum, + outfd) != 0) + goto out; break; } case DRR_END: { + struct drr_end *drre = &drr->drr_u.drr_end; /* use the recalculated checksum */ - ZIO_SET_CHECKSUM(&drre->drr_checksum, - stream_cksum.zc_word[0], stream_cksum.zc_word[1], - stream_cksum.zc_word[2], stream_cksum.zc_word[3]); - if ((write(outfd, drr, - sizeof (dmu_replay_record_t))) == -1) + drre->drr_checksum = stream_cksum; + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } case DRR_OBJECT: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; + struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), ofp); - if (cksum_and_write(buf, - P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), - &stream_cksum, outfd) == -1) - goto out; } + if (dump_record(drr, buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + &stream_cksum, outfd) != 0) + goto out; break; } case DRR_SPILL: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; + struct drr_spill *drrs = &drr->drr_u.drr_spill; (void) ssread(buf, drrs->drr_length, ofp); - if (cksum_and_write(buf, drrs->drr_length, - &stream_cksum, outfd) == -1) + if (dump_record(drr, buf, drrs->drr_length, + &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREEOBJECTS: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } case DRR_WRITE: { + struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; (void) ssread(buf, drrw->drr_length, ofp); @@ -380,7 +383,13 @@ cksummer(void *arg) if (ddt_update(dda->dedup_hdl, &ddt, &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, &dataref)) { + dmu_replay_record_t wbr_drr = {0}; + struct drr_write_byref *wbr_drrr = + &wbr_drr.drr_u.drr_write_byref; + /* block already present in stream */ + wbr_drr.drr_type = DRR_WRITE_BYREF; + wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; wbr_drrr->drr_length = drrw->drr_length; @@ -400,19 +409,13 @@ cksummer(void *arg) wbr_drrr->drr_key.ddk_prop = drrw->drr_key.ddk_prop; - if (cksum_and_write(&wbr_drr, - sizeof (dmu_replay_record_t), &stream_cksum, - outfd) == -1) + if (dump_record(&wbr_drr, NULL, 0, + &stream_cksum, outfd) != 0) goto out; } else { /* block not previously seen */ - if (cksum_and_write(drr, - sizeof (dmu_replay_record_t), &stream_cksum, - outfd) == -1) - goto out; - if (cksum_and_write(buf, - drrw->drr_length, - &stream_cksum, outfd) == -1) + if (dump_record(drr, buf, drrw->drr_length, + &stream_cksum, outfd) != 0) goto out; } break; @@ -420,28 +423,27 @@ cksummer(void *arg) case DRR_WRITE_EMBEDDED: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; + struct drr_write_embedded *drrwe = + &drr->drr_u.drr_write_embedded; (void) ssread(buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); - if (cksum_and_write(buf, + if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), - &stream_cksum, outfd) == -1) + &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREE: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } default: - (void) printf("INVALID record type 0x%x\n", + (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); @@ -1491,18 +1493,11 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; - err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); - /* write header nvlist */ - if (err != -1 && packbuf != NULL) { - err = cksum_and_write(packbuf, buflen, &zc, - outfd); - } + err = dump_record(&drr, packbuf, buflen, &zc, outfd); free(packbuf); - if (err == -1) { - err = errno; + if (err != 0) goto stderr_out; - } /* write end record */ bzero(&drr, sizeof (drr)); @@ -1736,6 +1731,8 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, int rv; int len = ilen; + assert(ilen <= SPA_MAXBLOCKSIZE); + do { rv = read(fd, cp, len); cp += rv; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b028e5ba4fa5..b8ecc24b3af7 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -75,7 +75,6 @@ dump_bytes_cb(void *arg) ssize_t resid; /* have to get resid to get detailed errno */ ASSERT0(dbi->dbi_len % 8); - fletcher_4_incremental_native(dbi->dbi_buf, dbi->dbi_len, &dsp->dsa_zc); dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, (caddr_t)dbi->dbi_buf, dbi->dbi_len, 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); @@ -110,6 +109,38 @@ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) return (dsp->dsa_err); } +/* + * For all record types except BEGIN, fill in the checksum (overlaid in + * drr_u.drr_checksum.drr_checksum). The checksum verifies everything + * up to the start of the checksum itself. + */ +static int +dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(dsp->dsa_drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &dsp->dsa_zc); + if (dsp->dsa_drr->drr_type != DRR_BEGIN) { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. + drr_checksum.drr_checksum)); + dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; + } + fletcher_4_incremental_native(&dsp->dsa_drr-> + drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), &dsp->dsa_zc); + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + return (SET_ERROR(EINTR)); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, + &dsp->dsa_zc); + if (dump_bytes(dsp, payload, payload_len) != 0) + return (SET_ERROR(EINTR)); + } + return (0); +} + static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) @@ -154,8 +185,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -178,8 +208,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, return (0); } else { /* not a continuation. Push out pending record */ - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -192,8 +221,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; @@ -225,12 +253,11 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } - /* write a DATA record */ + /* write a WRITE record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; @@ -256,9 +283,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_key.ddk_cksum = bp->blk_cksum; } - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) - return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz) != 0) + if (dump_record(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -272,8 +297,7 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (EINTR); dsp->dsa_pending_op = PENDING_NONE; } @@ -293,9 +317,7 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, decode_embedded_bp_compressed(bp, buf); - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } @@ -306,8 +328,7 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -319,9 +340,7 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) - return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz)) + if (dump_record(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -344,8 +363,7 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -359,8 +377,7 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) return (0); } else { /* can't be aggregated. Push out pending record */ - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -387,8 +404,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_bytes(dsp, dsp->dsa_drr, - sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } @@ -409,11 +425,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) - return (SET_ERROR(EINTR)); - - if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) + if (dump_record(dsp, DN_BONUS(dnp), + P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { return (SET_ERROR(EINTR)); + } /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * @@ -657,7 +672,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; - ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); dsp->dsa_featureflags = featureflags; @@ -669,7 +683,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, tag); - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { + if (dump_record(dsp, NULL, 0) != 0) { err = dsp->dsa_err; goto out; } @@ -678,7 +692,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, backup_cb, dsp); if (dsp->dsa_pending_op != PENDING_NONE) - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) + if (dump_record(dsp, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { @@ -692,7 +706,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { + if (dump_record(dsp, NULL, 0) != 0) { err = dsp->dsa_err; goto out; } @@ -1300,13 +1314,19 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, } struct restorearg { + objset_t *os; int err; boolean_t byteswap; vnode_t *vp; - char *buf; uint64_t voff; int bufsize; /* amount of memory allocated for buf */ + + dmu_replay_record_t *drr; + dmu_replay_record_t *next_drr; + char *buf; zio_cksum_t cksum; + zio_cksum_t prev_cksum; + avl_tree_t *guid_to_ds_map; }; @@ -1345,14 +1365,11 @@ free_guid_map_onexit(void *arg) kmem_free(ca, sizeof (avl_tree_t)); } -static void * -restore_read(struct restorearg *ra, int len, char *buf) +static int +restore_read(struct restorearg *ra, int len, void *buf) { int done = 0; - if (buf == NULL) - buf = ra->buf; - /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); ASSERT3U(len, <=, ra->bufsize); @@ -1361,7 +1378,7 @@ restore_read(struct restorearg *ra, int len, char *buf) ssize_t resid; ra->err = vn_rdwr(UIO_READ, ra->vp, - buf + done, len - done, + (char *)buf + done, len - done, ra->voff, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); @@ -1370,24 +1387,21 @@ restore_read(struct restorearg *ra, int len, char *buf) ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) - return (NULL); + return (ra->err); } ASSERT3U(done, ==, len); - if (ra->byteswap) - fletcher_4_incremental_byteswap(buf, len, &ra->cksum); - else - fletcher_4_incremental_native(buf, len, &ra->cksum); - return (buf); + return (0); } noinline static void -backup_byteswap(dmu_replay_record_t *drr) +byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); @@ -1417,10 +1431,7 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_write.drr_offset); DO64(drr_write.drr_length); DO64(drr_write.drr_toguid); - DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); break; case DRR_WRITE_BYREF: @@ -1431,10 +1442,8 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. + drr_key.ddk_cksum); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_WRITE_EMBEDDED: @@ -1457,15 +1466,17 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_spill.drr_toguid); break; case DRR_END: - DO64(drr_end.drr_checksum.zc_word[0]); - DO64(drr_end.drr_checksum.zc_word[1]); - DO64(drr_end.drr_checksum.zc_word[2]); - DO64(drr_end.drr_checksum.zc_word[3]); DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; default: break; } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + #undef DO64 #undef DO32 } @@ -1482,11 +1493,10 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } noinline static int -restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +restore_object(struct restorearg *ra, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; - void *data = NULL; uint64_t object; int err; @@ -1497,23 +1507,17 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } - err = dmu_object_info(os, drro->drr_object, &doi); + err = dmu_object_info(ra->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; - if (drro->drr_bonuslen) { - data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); - if (ra->err != 0) - return (ra->err); - } - /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file @@ -1527,14 +1531,14 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { - err = dmu_free_long_range(os, drro->drr_object, + err = dmu_free_long_range(ra->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } - tx = dmu_tx_create(os); + tx = dmu_tx_create(ra->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { @@ -1544,7 +1548,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(os, drro->drr_object, + err = dmu_object_claim(ra->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || @@ -1552,7 +1556,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ - err = dmu_object_reclaim(os, drro->drr_object, + err = dmu_object_reclaim(ra->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } @@ -1561,14 +1565,15 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) return (SET_ERROR(EINVAL)); } - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, - tx); - dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + dmu_object_set_checksum(ra->os, drro->drr_object, + drro->drr_checksumtype, tx); + dmu_object_set_compress(ra->os, drro->drr_object, + drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; - VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); @@ -1587,7 +1592,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) /* ARGSUSED */ noinline static int -restore_freeobjects(struct restorearg *ra, objset_t *os, +restore_freeobjects(struct restorearg *ra, struct drr_freeobjects *drrfo) { uint64_t obj; @@ -1597,13 +1602,13 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(os, &obj, FALSE, 0)) { + (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(os, obj, NULL) != 0) + if (dmu_object_info(ra->os, obj, NULL) != 0) continue; - err = dmu_free_long_object(os, obj); + err = dmu_free_long_object(ra->os, obj); if (err != 0) return (err); } @@ -1611,50 +1616,37 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, } noinline static int -restore_write(struct restorearg *ra, objset_t *os, - struct drr_write *drrw) +restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) { dmu_tx_t *tx; dmu_buf_t *bonus; - arc_buf_t *abuf; - void *data; int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) + if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - abuf = dmu_request_arcbuf(bonus, drrw->drr_length); - - data = restore_read(ra, drrw->drr_length, abuf->b_data); - if (data == NULL) { - dmu_return_arcbuf(abuf); - dmu_buf_rele(bonus, FTAG); - return (ra->err); - } - - tx = dmu_tx_create(os); + tx = dmu_tx_create(ra->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { - dmu_return_arcbuf(abuf); - dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); return (err); } if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + drrw->drr_length); } + + if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) + return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); @@ -1669,8 +1661,7 @@ restore_write(struct restorearg *ra, objset_t *os, * data from the stream to fulfill this write. */ static int -restore_write_byref(struct restorearg *ra, objset_t *os, - struct drr_write_byref *drrwbr) +restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; @@ -1696,7 +1687,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { - ref_os = os; + ref_os = ra->os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, @@ -1704,7 +1695,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, if (err != 0) return (err); - tx = dmu_tx_create(os); + tx = dmu_tx_create(ra->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); @@ -1713,7 +1704,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - dmu_write(os, drrwbr->drr_object, + dmu_write(ra->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); @@ -1721,12 +1712,11 @@ restore_write_byref(struct restorearg *ra, objset_t *os, } static int -restore_write_embedded(struct restorearg *ra, objset_t *os, - struct drr_write_embedded *drrwnp) +restore_write_embedded(struct restorearg *ra, + struct drr_write_embedded *drrwnp, void *data) { dmu_tx_t *tx; int err; - void *data; if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) return (EINVAL); @@ -1739,11 +1729,7 @@ restore_write_embedded(struct restorearg *ra, objset_t *os, if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); - data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); - if (data == NULL) - return (ra->err); - - tx = dmu_tx_create(os); + tx = dmu_tx_create(ra->os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); @@ -1753,7 +1739,7 @@ restore_write_embedded(struct restorearg *ra, objset_t *os, return (err); } - dmu_write_embedded(os, drrwnp->drr_object, + dmu_write_embedded(ra->os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); @@ -1763,31 +1749,26 @@ restore_write_embedded(struct restorearg *ra, objset_t *os, } static int -restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) +restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) { dmu_tx_t *tx; - void *data; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) return (SET_ERROR(EINVAL)); - data = restore_read(ra, drrs->drr_length, NULL); - if (data == NULL) - return (ra->err); - - if (dmu_object_info(os, drrs->drr_object, NULL) != 0) + if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } - tx = dmu_tx_create(os); + tx = dmu_tx_create(ra->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1814,8 +1795,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) /* ARGSUSED */ noinline static int -restore_free(struct restorearg *ra, objset_t *os, - struct drr_free *drrf) +restore_free(struct restorearg *ra, struct drr_free *drrf) { int err; @@ -1823,10 +1803,10 @@ restore_free(struct restorearg *ra, objset_t *os, drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - err = dmu_free_long_range(os, drrf->drr_object, + err = dmu_free_long_range(ra->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } @@ -1841,6 +1821,157 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) (void) dsl_destroy_head(name); } +static void +restore_cksum(struct restorearg *ra, int len, void *buf) +{ + if (ra->byteswap) { + fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + } else { + fletcher_4_incremental_native(buf, len, &ra->cksum); + } +} + +/* + * If len != 0, read payload into buf. + * Read next record's header into ra->next_drr. + * Verify checksum of payload and next record. + */ +static int +restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) +{ + int err; + zio_cksum_t cksum_orig; + zio_cksum_t *cksump; + + if (len != 0) { + ASSERT3U(len, <=, ra->bufsize); + err = restore_read(ra, len, buf); + if (err != 0) + return (err); + restore_cksum(ra, len, buf); + } + + ra->prev_cksum = ra->cksum; + + err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); + if (err != 0) + return (err); + if (ra->next_drr->drr_type == DRR_BEGIN) + return (SET_ERROR(EINVAL)); + + /* + * Note: checksum is of everything up to but not including the + * checksum itself. + */ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + restore_cksum(ra, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ra->next_drr); + + cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; + cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; + + if (ra->byteswap) + byteswap_record(ra->next_drr); + + if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) + return (SET_ERROR(ECKSUM)); + + restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); + + return (0); +} + +static int +restore_process_record(struct restorearg *ra) +{ + int err; + + switch (ra->drr->drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &ra->drr->drr_u.drr_object; + err = restore_read_payload_and_next_header(ra, + P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); + if (err != 0) + return (err); + return (restore_object(ra, drro, ra->buf)); + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &ra->drr->drr_u.drr_freeobjects; + err = restore_read_payload_and_next_header(ra, 0, NULL); + if (err != 0) + return (err); + return (restore_freeobjects(ra, drrfo)); + } + case DRR_WRITE: + { + struct drr_write *drrw = &ra->drr->drr_u.drr_write; + arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), + drrw->drr_length); + + err = restore_read_payload_and_next_header(ra, + drrw->drr_length, abuf->b_data); + if (err != 0) + return (err); + err = restore_write(ra, drrw, abuf); + /* if restore_write() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(abuf); + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &ra->drr->drr_u.drr_write_byref; + err = restore_read_payload_and_next_header(ra, 0, NULL); + if (err != 0) + return (err); + return (restore_write_byref(ra, drrwbr)); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &ra->drr->drr_u.drr_write_embedded; + err = restore_read_payload_and_next_header(ra, + P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); + if (err != 0) + return (err); + return (restore_write_embedded(ra, drrwe, ra->buf)); + } + case DRR_FREE: + { + struct drr_free *drrf = &ra->drr->drr_u.drr_free; + err = restore_read_payload_and_next_header(ra, 0, NULL); + if (err != 0) + return (err); + return (restore_free(ra, drrf)); + } + case DRR_END: + { + struct drr_end *drre = &ra->drr->drr_u.drr_end; + if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) + return (SET_ERROR(EINVAL)); + return (0); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; + err = restore_read_payload_and_next_header(ra, + drrs->drr_length, ra->buf); + if (err != 0) + return (err); + return (restore_spill(ra, drrs, ra->buf)); + } + default: + return (SET_ERROR(EINVAL)); + } +} + /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -1848,10 +1979,8 @@ int dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { + int err = 0; struct restorearg ra = { 0 }; - dmu_replay_record_t *drr; - objset_t *os; - zio_cksum_t pcksum; int featureflags; ra.byteswap = drc->drc_byteswap; @@ -1859,7 +1988,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra.vp = vp; ra.voff = *voffp; ra.bufsize = SPA_MAXBLOCKSIZE; + ra.drr = vmem_alloc(sizeof (*ra.drr), KM_SLEEP); ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); + ra.next_drr = vmem_alloc(sizeof (*ra.next_drr), KM_SLEEP); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -1869,7 +2000,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, /* * Open the objset we are modifying. */ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); + VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); @@ -1895,13 +2026,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, avl_create(ra.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); - ra.err = zfs_onexit_add_cb(minor, + err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { - ra.err = zfs_onexit_cb_data(minor, *action_handlep, + err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); if (ra.err != 0) goto out; @@ -1910,96 +2041,34 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, drc->drc_guid_to_ds_map = ra.guid_to_ds_map; } - /* - * Read records and process them. - */ - pcksum = ra.cksum; - while (ra.err == 0 && - NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { + err = restore_read_payload_and_next_header(&ra, 0, NULL); + if (err != 0) + goto out; + for (;;) { + void *tmp; + if (issig(JUSTLOOKING) && issig(FORREAL)) { - ra.err = SET_ERROR(EINTR); - goto out; + err = SET_ERROR(EINTR); + break; } - if (ra.byteswap) - backup_byteswap(drr); + tmp = ra.next_drr; + ra.next_drr = ra.drr; + ra.drr = tmp; - switch (drr->drr_type) { - case DRR_OBJECT: - { - /* - * We need to make a copy of the record header, - * because restore_{object,write} may need to - * restore_read(), which will invalidate drr. - */ - struct drr_object drro = drr->drr_u.drr_object; - ra.err = restore_object(&ra, os, &drro); - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects drrfo = - drr->drr_u.drr_freeobjects; - ra.err = restore_freeobjects(&ra, os, &drrfo); - break; - } - case DRR_WRITE: - { - struct drr_write drrw = drr->drr_u.drr_write; - ra.err = restore_write(&ra, os, &drrw); - break; - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref drrwbr = - drr->drr_u.drr_write_byref; - ra.err = restore_write_byref(&ra, os, &drrwbr); - break; - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded drrwe = - drr->drr_u.drr_write_embedded; - ra.err = restore_write_embedded(&ra, os, &drrwe); - break; - } - case DRR_FREE: - { - struct drr_free drrf = drr->drr_u.drr_free; - ra.err = restore_free(&ra, os, &drrf); + /* process ra.drr, read in ra.next_drr */ + err = restore_process_record(&ra); + if (err != 0) break; - } - case DRR_END: - { - struct drr_end drre = drr->drr_u.drr_end; - /* - * We compare against the *previous* checksum - * value, because the stored checksum is of - * everything before the DRR_END record. - */ - if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) - ra.err = SET_ERROR(ECKSUM); - goto out; - } - case DRR_SPILL: - { - struct drr_spill drrs = drr->drr_u.drr_spill; - ra.err = restore_spill(&ra, os, &drrs); + if (ra.drr->drr_type == DRR_END) break; - } - default: - ra.err = SET_ERROR(EINVAL); - goto out; - } - pcksum = ra.cksum; } - ASSERT(ra.err != 0); out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); - if (ra.err != 0) { + if (err != 0) { /* * destroy what we created, so we don't leave it in the * inconsistent restoring state. @@ -2007,9 +2076,11 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, dmu_recv_cleanup_ds(drc); } + vmem_free(ra.drr, sizeof (*ra.drr)); vmem_free(ra.buf, ra.bufsize); + vmem_free(ra.next_drr, sizeof (*ra.next_drr)); *voffp = ra.voff; - return (ra.err); + return (err); } static int