diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 34bfba2581a1..c6810ea919e0 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -248,10 +248,11 @@ get_usage(zfs_help_t idx) case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: - return (gettext("\treceive [-vnFu] \n" - "\treceive [-vnFu] [-o origin=] [-d | -e] " - "\n")); + "\treceive [-vnsFu] [-o origin=] [-d | -e] " + "\n" + "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" @@ -263,7 +264,8 @@ get_usage(zfs_help_t idx) return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " "\n" "\tsend [-Le] [-i snapshot|bookmark] " - "\n")); + "\n" + "\tsend [-nvPe] -t \n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); @@ -3687,6 +3689,7 @@ zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; + char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; @@ -3695,7 +3698,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) { switch (c) { case 'i': if (fromname) @@ -3736,6 +3739,9 @@ zfs_do_send(int argc, char **argv) case 'e': flags.embed_data = B_TRUE; break; + case 't': + resume_token = optarg; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3751,14 +3757,28 @@ zfs_do_send(int argc, char **argv) argc -= optind; argv += optind; - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); + if (resume_token != NULL) { + if (fromname != NULL || flags.replicate || flags.props || + flags.dedup) { + (void) fprintf(stderr, + gettext("invalid flags combined with -t\n")); + usage(B_FALSE); + } + if (argc != 0) { + (void) fprintf(stderr, gettext("no additional " + "arguments are permitted with -t\n")); + usage(B_FALSE); + } + } else { + if (argc < 1) { + (void) fprintf(stderr, + gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } } if (!flags.dryrun && isatty(STDOUT_FILENO)) { @@ -3768,6 +3788,11 @@ zfs_do_send(int argc, char **argv) return (1); } + if (resume_token != NULL) { + return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, + resume_token)); + } + /* * Special case sending a filesystem, or from a bookmark. */ @@ -3873,8 +3898,6 @@ zfs_do_send(int argc, char **argv) } /* - * zfs receive [-vnFu] [-d | -e] - * * Restore a backup stream from stdin. */ static int @@ -3882,6 +3905,8 @@ zfs_do_receive(int argc, char **argv) { int c, err; recvflags_t flags = { 0 }; + boolean_t abort_resumable = B_FALSE; + nvlist_t *props; nvpair_t *nvp = NULL; @@ -3889,7 +3914,7 @@ zfs_do_receive(int argc, char **argv) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":o:denuvF")) != -1) { + while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) { switch (c) { case 'o': if (parseprop(props, optarg) != 0) @@ -3911,9 +3936,15 @@ zfs_do_receive(int argc, char **argv) case 'v': flags.verbose = B_TRUE; break; + case 's': + flags.resumable = B_TRUE; + break; case 'F': flags.force = B_TRUE; break; + case 'A': + abort_resumable = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3946,6 +3977,44 @@ zfs_do_receive(int argc, char **argv) } } + if (abort_resumable) { + if (flags.isprefix || flags.istail || flags.dryrun || + flags.resumable || flags.nomount) { + (void) fprintf(stderr, gettext("invalid option")); + usage(B_FALSE); + } + + char namebuf[ZFS_MAXNAMELEN]; + (void) snprintf(namebuf, sizeof (namebuf), + "%s/%%recv", argv[0]); + + if (zfs_dataset_exists(g_zfs, namebuf, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { + zfs_handle_t *zhp = zfs_open(g_zfs, + namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + err = zfs_destroy(zhp, B_FALSE); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { + (void) fprintf(stderr, + gettext("'%s' does not have any " + "resumable receive state to abort\n"), + argv[0]); + return (1); + } + err = zfs_destroy(zhp, B_FALSE); + } + + return (err != 0); + } + if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " @@ -3953,7 +4022,6 @@ zfs_do_receive(int argc, char **argv) "You must redirect standard input.\n")); return (1); } - err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); return (err != 0); @@ -5784,6 +5852,24 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, return (0); } + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Contains partially-completed state from " + "\"zfs receive -r\", which can be resumed with " + "\"zfs send -t\"\n"), + cmdname, zfs_get_name(zhp)); + return (1); + } + /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index f288d148e574..08d52bb37a83 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -127,7 +127,7 @@ read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) (longlong_t)saved_cksum.zc_word[1], (longlong_t)saved_cksum.zc_word[2], (longlong_t)saved_cksum.zc_word[3]); - exit(1); + return (0); } return (sizeof (*drr)); } @@ -347,8 +347,7 @@ main(int argc, char *argv[]) if (verbose) (void) printf("\n"); - if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) { + if (drr->drr_payloadlen != 0) { nvlist_t *nv; int sz = drr->drr_payloadlen; diff --git a/include/libzfs.h b/include/libzfs.h index 2a1f2f50d306..c0b6785d1fbe 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -631,6 +631,10 @@ typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); +extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, + const char *); +extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, + const char *token); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, @@ -671,6 +675,12 @@ typedef struct recvflags { /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; + /* + * Mark the file systems as "resumable" and do not destroy them if the + * receive is interrupted + */ + boolean_t resumable; + /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; diff --git a/include/libzfs_core.h b/include/libzfs_core.h index bdd6c951ee49..5d3a6fda7dcb 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _LIBZFS_CORE_H @@ -58,7 +58,11 @@ enum lzc_send_flags { }; int lzc_send(const char *, const char *, int, enum lzc_send_flags); +int lzc_send_resume(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t); int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); +int lzc_receive_resumable(const char *, nvlist_t *, const char *, + boolean_t, int); int lzc_send_space(const char *, const char *, uint64_t *); boolean_t lzc_exists(const char *); diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 75d094f0812e..d700d1d17ed3 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -24,7 +24,7 @@ */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -272,6 +272,8 @@ typedef struct dmu_sendarg { uint64_t dsa_featureflags; uint64_t dsa_last_data_object; uint64_t dsa_last_data_offset; + uint64_t dsa_resume_object; + uint64_t dsa_resume_offset; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index 2442a1f8aab1..871f5625460e 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -36,10 +36,13 @@ struct vnode; struct dsl_dataset; struct drr_begin; struct avl_tree; +struct dmu_replay_record; -int dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, - int outfd, struct vnode *vp, offset_t *off); +extern const char *recv_clone_name; + +int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + struct vnode *vp, offset_t *off); int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, uint64_t *sizep); int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, @@ -50,12 +53,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, typedef struct dmu_recv_cookie { struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; struct drr_begin *drc_drrb; const char *drc_tofs; const char *drc_tosnap; boolean_t drc_newfs; boolean_t drc_byteswap; boolean_t drc_force; + boolean_t drc_resumable; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; @@ -63,8 +68,9 @@ typedef struct dmu_recv_cookie { cred_t *drc_cred; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc); +int dmu_recv_begin(char *tofs, char *tosnap, + struct dmu_replay_record *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep); int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h index 544b721e4612..c010edd440d9 100644 --- a/include/sys/dmu_traverse.h +++ b/include/sys/dmu_traverse.h @@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, + zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index a596642e3130..997c6b5decb5 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -91,6 +91,18 @@ struct dsl_pool; */ #define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" +/* + * These fields are set on datasets that are in the middle of a resumable + * receive, and allow the sender to resume the send if it is interrupted. + */ +#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" +#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" +#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" +#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" +#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" +#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. @@ -184,6 +196,14 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + /* + * When in the middle of a resumable receive, tracks how much + * progress we have made. + */ + uint64_t ds_resume_object[TXG_SIZE]; + uint64_t ds_resume_offset[TXG_SIZE]; + uint64_t ds_resume_bytes[TXG_SIZE]; + /* Protected by our dsl_dir's dd_lock */ list_t ds_prop_cbs; @@ -235,6 +255,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -315,6 +336,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); void dsl_dataset_deactivate_feature(uint64_t dsobj, diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 536fab785c6d..e6bee0be23e4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -156,6 +156,7 @@ typedef enum { ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, + ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_NUM_PROPS } zfs_prop_t; diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 601a9a70c580..58f68ed02578 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -90,14 +90,15 @@ typedef enum drr_headertype { * Feature flags for zfs send streams (flags in drr_versioninfo) */ -#define DMU_BACKUP_FEATURE_DEDUP (1<<0) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1) -#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2) +#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ -#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16) -#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17) +#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) +#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19) +#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) +#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* * Mask of all supported backup features @@ -105,11 +106,16 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ + DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) +typedef enum dmu_send_resume_token_version { + ZFS_SEND_RESUME_TOKEN_VERSION = 1 +} dmu_send_resume_token_version_t; + /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: @@ -359,14 +365,14 @@ typedef struct zfs_cmd { uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; + dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ + boolean_t zc_resumable; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 2a85b31c902e..bbe7e2e12a7c 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1853,22 +1853,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) return (value); } -static char * +static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; - char *value; + const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); - if ((value = (char *)zfs_prop_default_string(prop)) == NULL) - value = ""; + value = zfs_prop_default_string(prop); *source = ""; } @@ -2290,7 +2289,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, { char *source = NULL; uint64_t val; - char *str; + const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); @@ -2396,14 +2395,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, break; case ZFS_PROP_ORIGIN: - (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), - proplen); - /* - * If there is no parent at all, return failure to indicate that - * it doesn't apply to this dataset. - */ - if (propbuf[0] == '\0') + str = getprop_string(zhp, prop, &source); + if (str == NULL) return (-1); + (void) strlcpy(propbuf, str, proplen); break; case ZFS_PROP_CLONES: @@ -2585,8 +2580,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, break; case PROP_TYPE_STRING: - (void) strlcpy(propbuf, - getprop_string(zhp, prop, &source), proplen); + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + (void) strlcpy(propbuf, str, proplen); break; case PROP_TYPE_INDEX: diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 41ab86131437..558b0df60b0b 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1041,6 +1041,17 @@ mount_cb(zfs_handle_t *zhp, void *data) return (0); } + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + zfs_close(zhp); + return (0); + } + libzfs_add_handle(cbp, zhp); if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) { zfs_close(zhp); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 2adcb0c0f532..2082f1e86b27 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved @@ -56,6 +56,7 @@ #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" +#include #include #include #include @@ -66,6 +67,8 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); +static int guid_to_name(libzfs_handle_t *, const char *, + uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { { 0 } }; @@ -283,8 +286,7 @@ cksummer(void *arg) DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { + if (drr->drr_payloadlen != 0) { sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { @@ -1013,17 +1015,14 @@ static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; - zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = pa->pa_zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; unsigned long long bytes; char buf[16]; - time_t t; struct tm *tm; - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (!pa->pa_parsable) @@ -1056,6 +1055,51 @@ send_progress_thread(void *arg) } } +static void +send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, + uint64_t size, boolean_t parsable) +{ + if (parsable) { + if (fromsnap != NULL) { + (void) fprintf(fout, "incremental\t%s\t%s", + fromsnap, tosnap); + } else { + (void) fprintf(fout, "full\t%s", + tosnap); + } + } else { + if (fromsnap != NULL) { + if (strchr(fromsnap, '@') == NULL && + strchr(fromsnap, '#') == NULL) { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from @%s to %s"), + fromsnap, tosnap); + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from %s to %s"), + fromsnap, tosnap); + } + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "full send of %s"), + tosnap); + } + } + + if (size != 0) { + if (parsable) { + (void) fprintf(fout, "\t%llu", + (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + " estimated size is %s"), buf); + } + } + (void) fprintf(fout, "\n"); +} + static int dump_snapshot(zfs_handle_t *zhp, void *arg) { @@ -1135,37 +1179,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) (sdd->fromorigin || sdd->replicate); if (sdd->verbose) { - uint64_t size; - err = estimate_ioctl(zhp, sdd->prevsnap_obj, + uint64_t size = 0; + (void) estimate_ioctl(zhp, sdd->prevsnap_obj, fromorigin, &size); - if (sdd->parsable) { - if (sdd->prevsnap[0] != '\0') { - (void) fprintf(fout, "incremental\t%s\t%s", - sdd->prevsnap, zhp->zfs_name); - } else { - (void) fprintf(fout, "full\t%s", - zhp->zfs_name); - } - } else { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "send from @%s to %s"), - sdd->prevsnap, zhp->zfs_name); - } - if (err == 0) { - if (sdd->parsable) { - (void) fprintf(fout, "\t%llu\n", - (longlong_t)size); - } else { - char buf[16]; - zfs_nicenum(size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - " estimated size is %s\n"), buf); - } - sdd->size += size; - } else { - (void) fprintf(fout, "\n"); - } + send_print_verbose(fout, zhp->zfs_name, + sdd->prevsnap[0] ? sdd->prevsnap : NULL, + size, sdd->parsable); + sdd->size += size; } if (!sdd->dryrun) { @@ -1376,6 +1397,231 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) return (0); } +nvlist_t * +zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) +{ + unsigned int version; + int nread, i; + unsigned long long checksum, packed_len; + + /* + * Decode token header, which is: + * -- + * Note that the only supported token version is 1. + */ + nread = sscanf(token, "%u-%llx-%llx-", + &version, &checksum, &packed_len); + if (nread != 3) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid format)")); + return (NULL); + } + + if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid version %u)"), + version); + return (NULL); + } + + /* convert hexadecimal representation to binary */ + token = strrchr(token, '-') + 1; + int len = strlen(token) / 2; + unsigned char *compressed = zfs_alloc(hdl, len); + for (i = 0; i < len; i++) { + nread = sscanf(token + i * 2, "%2hhx", compressed + i); + if (nread != 1) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt " + "(payload is not hex-encoded)")); + return (NULL); + } + } + + /* verify checksum */ + zio_cksum_t cksum; + fletcher_4_native(compressed, len, &cksum); + if (cksum.zc_word[0] != checksum) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (incorrect checksum)")); + return (NULL); + } + + /* uncompress */ + void *packed = zfs_alloc(hdl, packed_len); + uLongf packed_len_long = packed_len; + if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || + packed_len_long != packed_len) { + free(packed); + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (decompression failed)")); + return (NULL); + } + + /* unpack nvlist */ + nvlist_t *nv; + int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); + free(packed); + free(compressed); + if (error != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (nvlist_unpack failed)")); + return (NULL); + } + return (nv); +} + +int +zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + const char *resume_token) +{ + char errbuf[1024]; + char *toname; + char *fromname = NULL; + uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; + zfs_handle_t *zhp; + int error = 0; + char name[ZFS_MAXNAMELEN]; + enum lzc_send_flags lzc_flags = 0; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + if (resume_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist + */ + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + if (flags->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "resume token contents:\n")); + nvlist_print(stderr, resume_nvl); + } + + if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || + nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || + nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt")); + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + fromguid = 0; + (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + + if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + + if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { + if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is no longer the same snapshot used in " + "the initial send"), toname); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' used in the initial send no longer exists"), + toname); + } + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unable to access '%s'"), name); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + + if (fromguid != 0) { + if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source %#llx no longer exists"), + (longlong_t)fromguid); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + fromname = name; + } + + if (flags->verbose) { + uint64_t size = 0; + error = lzc_send_space(zhp->zfs_name, fromname, &size); + if (error == 0) + size = MAX(0, (int64_t)(size - bytes)); + send_print_verbose(stderr, zhp->zfs_name, fromname, + size, flags->parsable); + } + + if (!flags->dryrun) { + progress_arg_t pa = { 0 }; + pthread_t tid; + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = outfd; + pa.pa_parsable = flags->parsable; + + error = pthread_create(&tid, NULL, + send_progress_thread, &pa); + if (error != 0) { + zfs_close(zhp); + return (error); + } + } + + error = lzc_send_resume(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff); + + if (flags->progress) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + } + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + zfs_close(zhp); + + switch (error) { + case 0: + return (0); + case EXDEV: + case ENOENT: + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + + zfs_close(zhp); + + return (error); +} + /* * Generate a send stream for the dataset identified by the argument zhp. * @@ -1913,6 +2159,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, typedef struct guid_to_name_data { uint64_t guid; + boolean_t bookmark_ok; char *name; char *skip; } guid_to_name_data_t; @@ -1921,20 +2168,25 @@ static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; + const char *slash; int err; if (gtnd->skip != NULL && - strcmp(zhp->zfs_name, gtnd->skip) == 0) { + (slash = strrchr(zhp->zfs_name, '/')) != NULL && + strcmp(slash + 1, gtnd->skip) == 0) { + zfs_close(zhp); return (0); } - if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { + if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); + if (err != EEXIST && gtnd->bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } @@ -1948,45 +2200,48 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - char *name) + boolean_t bookmark_ok, char *name) { - /* exhaustive search all local snapshots */ char pname[ZFS_MAXNAMELEN]; guid_to_name_data_t gtnd; - int err = 0; - zfs_handle_t *zhp; - char *cp; gtnd.guid = guid; + gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; - (void) strlcpy(pname, parent, sizeof (pname)); - /* - * Search progressively larger portions of the hierarchy. This will + * Search progressively larger portions of the hierarchy, starting + * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ - while ((cp = strrchr(pname, '/')) != NULL) { - + (void) strlcpy(pname, parent, sizeof (pname)); + char *cp = strrchr(pname, '@'); + if (cp == NULL) + cp = strchr(pname, '\0'); + for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; - zhp = make_dataset_handle(hdl, pname); + zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; - - err = zfs_iter_children(zhp, guid_to_name_cb, >nd); + int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); + if (err != EEXIST) + err = zfs_iter_children(zhp, guid_to_name_cb, >nd); + if (err != EEXIST && bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* - * Remember the dataset that we already searched, so we - * skip it next time through. + * Remember the last portion of the dataset so we skip it next + * time through (as we've already searched that portion of the + * hierarchy). */ - gtnd.skip = pname; + gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); @@ -2587,11 +2842,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) switch (drr->drr_type) { case DRR_BEGIN: - /* NB: not to be used on v2 stream packages */ if (drr->drr_payloadlen != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid substream header")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + (void) recv_read(hdl, fd, buf, + drr->drr_payloadlen, B_FALSE, NULL); } break; @@ -2652,6 +2905,40 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) return (-1); } +static void +recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, + boolean_t resumable) +{ + char target_fs[ZFS_MAXNAMELEN]; + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream")); + + if (!resumable) + return; + (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); + *strchr(target_fs, '@') = '\0'; + zfs_handle_t *zhp = zfs_open(hdl, target_fs, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return; + + char token_buf[ZFS_MAXPROPLEN]; + int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), + NULL, NULL, 0, B_TRUE); + if (error == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream.\n" + "Partially received snapshot is saved.\n" + "A resuming stream can be generated on the sending " + "system by running:\n" + " zfs send -t %s"), + token_buf); + } + zfs_close(zhp); +} + /* * Restores a backup of tosnap from the file descriptor specified by infd. */ @@ -2800,7 +3087,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, */ if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, zc.zc_value, - drrb->drr_fromguid, zc.zc_string) != 0) { + drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), @@ -2816,8 +3103,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zc.zc_string); } + boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || - (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap); + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* @@ -2836,7 +3125,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char suffix[ZFS_MAXNAMELEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); if (guid_to_name(hdl, zc.zc_name, parent_snapguid, - zc.zc_value) == 0) { + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } @@ -2863,7 +3152,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char snap[ZFS_MAXNAMELEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, - zc.zc_value) == 0) { + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } @@ -2877,11 +3166,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_handle_t *zhp; /* - * Destination fs exists. Therefore this should either - * be an incremental, or the stream specifies a new fs - * (full stream or clone) and they want us to blow it - * away (and have therefore specified -F and removed any - * snapshots). + * Destination fs exists. It must be one of these cases: + * - an incremental send stream + * - the stream specifies a new fs (full stream or clone) + * and they want us to blow away the existing fs (and + * have therefore specified -F and removed any snapshots) + * - we are resuming a failed receive. */ if (stream_wantsnewfs) { if (!flags->force) { @@ -2936,6 +3226,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (-1); } } + + /* + * If we are resuming a newfs, set newfs here so that we will + * mount it if the recv succeeds this time. We can tell + * that it was a newfs on the first recv because the fs + * itself will be inconsistent (if the fs existed when we + * did the first recv, we would have received it into + * .../%recv). + */ + if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) + newfs = B_TRUE; + zfs_close(zhp); } else { /* @@ -2968,9 +3270,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, newfs = B_TRUE; } - zc.zc_begin_record = drr_noswap->drr_u.drr_begin; + zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; zc.zc_guid = flags->force; + zc.zc_resumable = flags->resumable; if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", @@ -3107,8 +3410,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid stream (checksum mismatch)")); + recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: @@ -3310,7 +3612,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. - * (-1 will override -2). + * (-1 will override -2, if -1 and the resumable flag was specified the + * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index b706e6f6be88..220792300b8c 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -467,6 +467,13 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp) int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) +{ + return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); +} + +int +lzc_send_resume(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) { nvlist_t *args; int err; @@ -479,6 +486,10 @@ lzc_send(const char *snapname, const char *from, int fd, fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + } err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); @@ -536,22 +547,9 @@ recv_read(int fd, void *buf, int ilen) return (0); } -/* - * The simplest receive case: receive from the specified fd, creating the - * specified snapshot. Apply the specified properties a "received" properties - * (which can be overridden by locally-set properties). If the stream is a - * clone, its origin snapshot must be specified by 'origin'. The 'force' - * flag will cause the target filesystem to be rolled back or destroyed if - * necessary to receive. - * - * Return 0 on success or an errno on failure. - * - * Note: this interface does not work on dedup'd streams - * (those with DMU_BACKUP_FEATURE_DEDUP). - */ -int -lzc_receive(const char *snapname, nvlist_t *props, const char *origin, - boolean_t force, int fd) +static int +lzc_receive_impl(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, boolean_t resumable, int fd) { /* * The receive ioctl is still legacy, so we need to construct our own @@ -561,7 +559,6 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, char *atp; char *packed = NULL; size_t size; - dmu_replay_record_t drr; int error; ASSERT3S(g_refcount, >, 0); @@ -597,10 +594,9 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); /* zc_begin_record is non-byteswapped BEGIN record */ - error = recv_read(fd, &drr, sizeof (drr)); + error = recv_read(fd, &zc.zc_begin_record, sizeof (zc.zc_begin_record)); if (error != 0) goto out; - zc.zc_begin_record = drr.drr_u.drr_begin; /* zc_cookie is fd to read from */ zc.zc_cookie = fd; @@ -608,6 +604,8 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, /* zc guid is force flag */ zc.zc_guid = force; + zc.zc_resumable = resumable; + /* zc_cleanup_fd is unused */ zc.zc_cleanup_fd = -1; @@ -622,6 +620,39 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, return (error); } +/* + * The simplest receive case: receive from the specified fd, creating the + * specified snapshot. Apply the specified properties as "received" properties + * (which can be overridden by locally-set properties). If the stream is a + * clone, its origin snapshot must be specified by 'origin'. The 'force' + * flag will cause the target filesystem to be rolled back or destroyed if + * necessary to receive. + * + * Return 0 on success or an errno on failure. + * + * Note: this interface does not work on dedup'd streams + * (those with DMU_BACKUP_FEATURE_DEDUP). + */ +int +lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (lzc_receive_impl(snapname, props, origin, force, B_FALSE, fd)); +} + +/* + * Like lzc_receive, but if the receive fails due to premature stream + * termination, the intermediate state will be preserved on disk. In this + * case, ECKSUM will be returned. The receive may subsequently be resumed + * with a resuming send stream generated by lzc_send_resume(). + */ +int +lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (lzc_receive_impl(snapname, props, origin, force, B_TRUE, fd)); +} + /* * Roll back this filesystem or volume to its most recent snapshot. * If snapnamebuf is not NULL, it will be filled in with the name diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 3290ababec5b..fe916c905476 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -179,17 +179,27 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +\fBzfs\fR \fBsend\fR [\fB-Le\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf -\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +\fBzfs\fR \fBsend\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR .fi .LP .nf -\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR +\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBreceive\fR \fB-A\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP @@ -531,6 +541,17 @@ For file systems, indicates whether the file system is currently mounted. This p For cloned file systems or volumes, the snapshot from which the clone was created. See also the \fBclones\fR property. .RE +.sp +.ne 2 +.mk +.na +\fB\fBreceive_resume_token\fR\fR +.ad +.sp .6 +.RS 4n +For filesystems or volumes which have saved partially-completed state from \fBzfs receive -s\fR , this opaque token can be provided to \fBzfs send -t\fR to resume and complete the \fBzfs receive\fR. +.RE + .sp .ne 2 .mk @@ -2862,7 +2883,7 @@ The format of the stream is committed. You will be able to receive your streams .sp .ne 2 .na -\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +\fBzfs send\fR [\fB-Le\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .ad .sp .6 .RS 4n @@ -2872,24 +2893,6 @@ the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be "--head--". -.sp -.ne 2 -.na -\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR -.ad -.sp .6 -.RS 4n -Generate an incremental send stream. The incremental source must be an earlier -snapshot in the destination's history. It will commonly be an earlier -snapshot in the destination's filesystem, in which case it can be -specified as the last component of the name (the \fB#\fR or \fB@\fR character -and following). -.sp -If the incremental target is a clone, the incremental source can -be the origin snapshot, or an earlier snapshot in the origin's filesystem, -or the origin's origin, etc. -.RE - .sp .ne 2 .mk @@ -2924,16 +2927,46 @@ then the receiving system must have that feature enabled as well. See \fBembedded_data\fR feature. .RE +.sp +.ne 2 +.na +\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR +.ad +.sp .6 +.RS 4n +Generate an incremental send stream. The incremental source must be an earlier +snapshot in the destination's history. It will commonly be an earlier +snapshot in the destination's filesystem, in which case it can be +specified as the last component of the name (the \fB#\fR or \fB@\fR character +and following). +.sp +If the incremental target is a clone, the incremental source can +be the origin snapshot, or an earlier snapshot in the origin's filesystem, +or the origin's origin, etc. +.RE + +.RE +.sp +.ne 2 +.na +\fB\fBzfs send\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR\fR +.ad +.sp .6 +.RS 4n +Creates a send stream which resumes an interrupted receive. The \fIreceive_resume_token\fR is the value of this property on the filesystem or volume that was being received into. See the documentation for \fBzfs receive -s\fR for more details. + +.RE + .RE .sp .ne 2 .mk .na -\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR +\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na -\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR +\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR .ad .sp .6 .RS 4n @@ -2952,21 +2985,36 @@ The \fB-d\fR and \fB-e\fR options cause the file system name of the target snaps .ne 2 .mk .na -\fB\fB-d\fR\fR +\fB\fB-F\fR\fR .ad .sp .6 .RS 4n -Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above. +Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side. .RE .sp .ne 2 +.mk .na -\fB\fB-e\fR\fR +\fB\fB-n\fR\fR .ad .sp .6 .RS 4n -Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above. +Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. +.RE + +.sp +.ne 2 +.na +\fB\fB-s\fR\fR +.ad +.sp .6 +.RS 4n +If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream (e.g. due to network failure or failure of the remote system if the stream is being read over a network connection), a checksum error in the stream, termination of the \fBzfs receive\fR process, or unclean shutdown of the system. +.sp +The receive can be resumed with a stream generated by \fBzfs send -t\fR token, where the \fItoken\fR is the value of the \fBreceive_resume_token\fR property of the filesystem or volume which is received into. +.sp +To use this flag, the storage pool must have the \fBextensible_dataset\fR feature enabled. See \fBzpool-features\fR(5) for details on ZFS feature flags. .RE .sp @@ -2994,11 +3042,21 @@ Print verbose information about the stream and the time required to perform the .ne 2 .mk .na -\fB\fB-n\fR\fR +\fB\fB-d\fR\fR .ad .sp .6 .RS 4n -Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. +Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above. +.RE + +.sp +.ne 2 +.na +\fB\fB-e\fR\fR +.ad +.sp .6 +.RS 4n +Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above. .RE .sp @@ -3012,16 +3070,17 @@ Do not actually receive the stream. This can be useful in conjunction with the \ Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin. .RE +.RE + .sp .ne 2 .mk .na -\fB\fB-F\fR\fR +\fB\fBzfs receive\fR [\fB-A\fR] \fIfilesystem\fR|\fIvolume\fR .ad .sp .6 .RS 4n -Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side. -.RE +Abort an interrupted \fBzfs receive \fB-s\fR\fR, deleting its saved partially received state. .RE diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 5a88cbe6c7b0..d4fc6d371603 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -360,6 +360,10 @@ zfs_prop_init(void) zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", "ROOTCONTEXT"); + zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, + "receive_resume_token", + NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "RESUMETOK"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 7e3a5fb208cc..bf84fd04774a 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -362,6 +362,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -413,6 +424,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, recordsize_changed_cb, os); } } + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); @@ -471,6 +484,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 88d27c867f05..9bc1ebb8081e 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -61,12 +61,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; -static const char *recv_clone_name = "%recv"; +const char *recv_clone_name = "%recv"; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) +static void byteswap_record(dmu_replay_record_t *drr); + struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ @@ -74,6 +76,7 @@ struct send_thread_arg { int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; + zbookmark_phys_t resume; }; struct send_block_record { @@ -96,7 +99,7 @@ dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; dmu_sendarg_t *dsp = dbi->dbi_dsp; - dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); ssize_t resid; /* have to get resid to get detailed errno */ ASSERT0(dbi->dbi_len % 8); @@ -177,7 +180,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given - * object+offset. We know that the one-record constraint is + * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * @@ -425,6 +428,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. + */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); @@ -505,6 +521,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t record_size; int err = 0; + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + if (sta->cancel) return (SET_ERROR(EINTR)); @@ -541,8 +560,10 @@ send_traverse_thread(void *arg) struct send_block_record *data; if (st_arg->ds != NULL) { - err = traverse_dataset(st_arg->ds, st_arg->fromtxg, - st_arg->flags, send_cb, arg); + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + if (err != EINTR) st_arg->error_code = err; } @@ -572,6 +593,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT3U(zb->zb_level, >=, 0); + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); + if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); @@ -633,6 +657,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) uint64_t offset; ASSERT0(zb->zb_level); + ASSERT(zb->zb_object > dsa->dsa_resume_object || + (zb->zb_object == dsa->dsa_resume_object && + zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { @@ -693,8 +721,10 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, - boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) + zfs_bookmark_phys_t *ancestor_zb, + boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, + vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; @@ -702,8 +732,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; - struct send_thread_arg to_arg; + void *payload = NULL; + struct send_thread_arg to_arg = {{{ 0 }}}; struct send_block_record *to_data; + size_t payload_len; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -740,6 +772,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); @@ -775,6 +811,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); @@ -783,7 +821,27 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); - if (dump_record(dsp, NULL, 0) != 0) { + payload_len = 0; + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + nvlist_t *nvl; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { err = dsp->dsa_err; goto out; } @@ -893,19 +951,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, outfd, 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, outfd, 0, 0, vp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int -dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, - int outfd, vnode_t *vp, offset_t *off) +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -972,10 +1030,12 @@ dmu_send(const char *tosnap, const char *fromsnap, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, vp, off); } if (owned) dsl_dataset_disown(ds, FTAG); @@ -1215,6 +1275,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || @@ -1227,6 +1288,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the @@ -1330,15 +1395,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; - uint64_t crflags; + uint64_t crflags = 0; - crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? - DS_FLAG_CI_DATASET : 0; + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1375,6 +1441,32 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + if (drba->drba_cookie->drc_resumable) { + uint64_t one = 1; + uint64_t zero = 0; + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1392,56 +1484,191 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal_ds(newds, "receive", tx, ""); } +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + char recvname[ZFS_MAXNAMELEN]; + const char *tofs = drba->drba_cookie->drc_tofs; + uint64_t val; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + char recvname[ZFS_MAXNAMELEN]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; - dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_resumable = resumable; drc->drc_cred = CRED(); - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - else if (drrb->drr_magic != DMU_BACKUP_MAGIC) - return (SET_ERROR(EINVAL)); - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (drc->drc_byteswap) { - fletcher_4_incremental_byteswap(drr, + fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - fletcher_4_incremental_native(drr, + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - - if (drc->drc_byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } else { + return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); - return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } } struct receive_record_arg { @@ -1453,6 +1680,7 @@ struct receive_record_arg { */ arc_buf_t *write_buf; int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; @@ -1461,6 +1689,7 @@ struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; + /* * These three args are used to signal to the main thread that we're * done. @@ -1468,15 +1697,20 @@ struct receive_writer_arg { kmutex_t mutex; kcondvar_t cv; boolean_t done; + int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object, last_offset; + uint64_t bytes_read; /* bytes read when current record created */ }; struct receive_arg { objset_t *os; vnode_t *vp; /* The vnode to read the stream from */ uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. @@ -1548,14 +1782,21 @@ receive_read(struct receive_arg *ra, int len, void *buf) ra->voff, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - if (resid == len - done) - ra->err = SET_ERROR(EINVAL); + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. + */ + ra->err = SET_ERROR(ECKSUM); + } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } + ra->bytes_read += len; + ASSERT3U(done, ==, len); return (0); } @@ -1658,6 +1899,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } } +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) @@ -1754,6 +2032,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); + return (0); } @@ -1779,6 +2058,7 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); } + return (0); } @@ -1794,6 +2074,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -1816,8 +2108,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); + return (0); } @@ -1876,43 +2177,48 @@ receive_write_byref(struct receive_writer_arg *rwa, dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. */ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwnp, void *data) + struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; - if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); - if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); - if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); - if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrwnp->drr_object, - drrwnp->drr_offset, drrwnp->drr_length); + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - dmu_write_embedded(rwa->os, drrwnp->drr_object, - drrwnp->drr_offset, data, drrwnp->drr_etype, - drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + /* See comment in restore_write. */ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } @@ -1960,6 +2266,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); + return (0); } @@ -1986,10 +2293,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { - char name[MAXNAMELEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[MAXNAMELEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } } static void @@ -2018,12 +2331,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - err = receive_read(ra, len, ra->rrd->payload); + err = receive_read(ra, len, buf); if (err != 0) return (err); - receive_cksum(ra, len, ra->rrd->payload); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } } ra->prev_cksum = ra->cksum; @@ -2031,6 +2349,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; @@ -2210,7 +2529,7 @@ receive_read_record(struct receive_arg *ra) { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: @@ -2237,6 +2556,10 @@ receive_process_record(struct receive_writer_arg *rwa, { int err; + /* Processing in order, therefore bytes_read should be increasing. */ + ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); + rwa->bytes_read = rrd->bytes_read; + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2331,6 +2654,33 @@ receive_writer_thread(void *arg) mutex_exit(&rwa->mutex); } +static int +resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +{ + uint64_t val; + objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(ra->os); + uint64_t resume_obj, resume_off; + + if (nvlist_lookup_uint64(begin_nvl, + "resume_object", &resume_obj) != 0 || + nvlist_lookup_uint64(begin_nvl, + "resume_offset", &resume_off) != 0) { + return (SET_ERROR(EINVAL)); + } + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); + if (resume_obj != val) + return (SET_ERROR(EINVAL)); + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); + if (resume_off != val) + return (SET_ERROR(EINVAL)); + + return (0); +} + + /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a @@ -2352,11 +2702,21 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, struct receive_writer_arg rwa = { 0 }; int featureflags; struct receive_ign_obj_node *n; + uint32_t payloadlen; + void *payload; + nvlist_t *begin_nvl = NULL; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; + + if (dsl_dataset_is_zapified(drc->drc_ds)) { + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (ra.bytes_read), 1, &ra.bytes_read); + } + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), offsetof(struct receive_ign_obj_node, node)); @@ -2409,9 +2769,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = receive_read_payload_and_next_header(&ra, 0, NULL); - if (err) + payloadlen = drc->drc_drr_begin->drr_payloadlen; + payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(&ra, payloadlen, payload); + if (err != 0) { + if (payloadlen != 0) + kmem_free(payload, payloadlen); goto out; + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(&ra, begin_nvl); + if (err != 0) + goto out; + } (void) bqueue_init(&rwa.q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); @@ -2419,6 +2799,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); rwa.os = ra.os; rwa.byteswap = drc->drc_byteswap; + rwa.resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, TS_RUN, minclsyspri); @@ -2477,13 +2858,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, err = rwa.err; out: + nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* - * destroy what we created, so we don't leave it in the - * inconsistent restoring state. + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. */ dmu_recv_cleanup_ds(drc); } @@ -2643,6 +3026,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 07164161bb94..23d763a4ebdb 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -47,6 +47,7 @@ typedef struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; + zbookmark_phys_t pd_resume; } prefetch_data_t; typedef struct traverse_data { @@ -310,23 +311,23 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - dnode_phys_t *cdnp; + dnode_phys_t *child_dnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - cdnp = buf->b_data; + child_dnp = buf->b_data; for (i = 0; i < epb; i++) { - prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { - err = traverse_dnode(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } @@ -394,9 +395,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. */ - td->td_resume->zb_blkid = zb->zb_blkid << - (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } td->td_paused = B_TRUE; } @@ -428,6 +435,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); @@ -505,6 +516,7 @@ traverse_prefetch_thread(void *arg) td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -534,12 +546,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - /* - * The data prefetching mechanism (the prefetch thread) is incompatible - * with resuming from a bookmark. - */ - ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); - td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP); pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP); czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); @@ -563,6 +569,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, } pd->pd_flags = flags; + if (resume != NULL) + pd->pd_resume = *resume; mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); @@ -615,11 +623,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, * in syncing context). */ int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); } int @@ -652,7 +668,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, /* visit each dataset */ for (obj = 1; err == 0; - err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index a5a9694fc14a..d8aedab12300 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -50,6 +50,9 @@ #include #include #include +#include +#include +#include /* * The SPA supports block sizes up to 16MB. However, very large blocks @@ -702,6 +705,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; @@ -712,6 +716,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) return (gotit); } +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + static void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { @@ -1622,6 +1636,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + dmu_objset_sync(ds->ds_objset, zio, tx); for (f = 0; f < SPA_FEATURES; f++) { @@ -1677,6 +1706,78 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) nvlist_free(propval); } +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + char buf[256]; + size_t packed_size, compressed_size; + zio_cksum_t cksum; + int i; + char *propval; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + fletcher_4_native(compressed, compressed_size, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + strfree(propval); + } +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { @@ -1750,6 +1851,28 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } } + if (!dsl_dataset_is_snapshot(ds)) { + char recvname[ZFS_MAXNAMELEN]; + dsl_dataset_t *recv_ds; + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. + */ + dsl_dataset_name(ds, recvname); + (void) strcat(recvname, "/"); + (void) strcat(recvname, recv_clone_name); + if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } } void @@ -1781,6 +1904,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) dsl_dataset_rele(ods, FTAG); } } + } uint64_t @@ -3414,6 +3538,23 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} + #if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_LP64) module_param(zfs_max_recordsize, int, 0644); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index d0015d1bd97b..4dae02ea8f04 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -976,9 +976,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + dmu_objset_rele(os, FTAG); - if (inconsistent) + if (need_destroy) (void) dsl_destroy_head(dsname); } return (0); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0030799023fd..b0ff59a7260d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3975,6 +3975,7 @@ static boolean_t zfs_ioc_recv_inject_err; * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) + * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read @@ -4021,13 +4022,13 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (SET_ERROR(EBADF)); } - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, origin, &drc); + &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; @@ -5156,6 +5157,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. * } * * outnvl is unused @@ -5171,6 +5174,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) file_t *fp; boolean_t largeblockok; boolean_t embedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5181,12 +5186,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + if ((fp = getf(fd)) == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, - fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, largeblockok, fd, + resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off;