Skip to content

Commit

Permalink
OpenZFS 6393 - zfs receive a full send as a clone
Browse files Browse the repository at this point in the history
Authored by: Paul Dagnelie <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Approved by: Dan McDonald <[email protected]>
Ported-by: Brian Behlendorf <[email protected]>

OpenZFS-issue: https://www.illumos.org/issues/6394
OpenZFS-commit: openzfs/openzfs@68ecb2e
  • Loading branch information
pcd1193182 authored and behlendorf committed Jun 28, 2016
1 parent fd41e93 commit e6d3a84
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 64 deletions.
3 changes: 1 addition & 2 deletions include/sys/dmu_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_DMU_IMPL_H
Expand Down Expand Up @@ -268,7 +268,6 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
Expand Down
12 changes: 11 additions & 1 deletion include/sys/zfs_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_ZFS_IOCTL_H
Expand Down Expand Up @@ -138,6 +138,16 @@ typedef enum dmu_send_resume_token_version {

#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
/*
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
* records that are created by the sending process. This means that the send
* stream can be received as a clone, even though it is not an incremental.
* This is not implemented as a feature flag, because the receiving side does
* not need to have implemented it to receive this stream; it is fully backwards
* compatible. We need a flag, though, because full send streams without it
* cannot necessarily be received as a clone correctly.
*/
#define DRR_FLAG_FREERECORDS (1<<2)

/*
* flags in the drr_checksumflags field in the DRR_WRITE and
Expand Down
9 changes: 7 additions & 2 deletions man/man8/zfs.8
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
.\"
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011 Joshua M. Clulow <[email protected]>
.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved.
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
Expand Down Expand Up @@ -2991,7 +2991,12 @@ Discard all but the last element of the sent snapshot's file system name, using
.ad
.sp .6
.RS 4n
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
Forces the stream to be received as a clone of the given snapshot.
If the stream is a full send stream, the filesystem described by the stream
is created as a clone of the specified snapshot. The choice of snapshot does
not affect whether the receive succeeds or fails, as long as the snapshot
exists. If the stream is an incremental send stream, all the normal
verification is performed.
.RE

.RE
Expand Down
168 changes: 110 additions & 58 deletions module/zfs/dmu_send.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/

Expand Down Expand Up @@ -173,6 +172,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
return (0);
}

/*
* Fill in the drr_free struct, or perform aggregation if the previous record is
* also a free record, and the two are adjacent.
*
* Note that we send free records even for a full send, because we want to be
* able to receive a full send as a clone, which requires a list of all the free
* and freeobject records that were generated on the source.
*/
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length)
Expand All @@ -196,15 +203,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));

/*
* If we are doing a non-incremental send, then there can't
* be any data in the dataset we're receiving into. Therefore
* a free record would simply be a no-op. Save space by not
* sending it to begin with.
*/
if (!dsp->dsa_incremental)
return (0);

if (length != -1ULL && offset + length < offset)
length = -1ULL;

Expand Down Expand Up @@ -382,10 +380,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

/* See comment in dump_free(). */
if (!dsp->dsa_incremental)
return (0);

/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
Expand Down Expand Up @@ -796,6 +790,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid =
Expand All @@ -818,7 +813,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff;
Expand Down Expand Up @@ -1336,7 +1330,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* target fs already exists; recv into temp clone */

/* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE) {
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
Expand All @@ -1355,6 +1349,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
drba->drba_origin))
return (SET_ERROR(ENOENT));

/*
* If we're receiving a full send as a clone, and it doesn't
* contain all the necessary free records and freeobject
* records, reject it.
*/
if (fromguid == 0 && drba->drba_origin &&
!(flags & DRR_FLAG_FREERECORDS))
return (SET_ERROR(EINVAL));

/* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
Expand Down Expand Up @@ -1394,7 +1397,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
fromguid != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV));
Expand Down Expand Up @@ -1724,6 +1728,20 @@ struct receive_writer_arg {
uint64_t bytes_read; /* bytes read when current record created */
};

/*
 * A list of object numbers kept in ascending order.  Nodes are appended by
 * objlist_insert() and consumed from the head by objlist_exists(); the receive
 * code uses one of these (ignore_objlist in struct receive_arg) to track
 * objects for which prefetches should not be issued.
 */
struct objlist {
list_t list; /* List of struct receive_objnode. */
/*
 * Object number passed to the most recent objlist_exists() call.  Used
 * only to assert that objects are being looked up in ascending order.
 */
uint64_t last_lookup;
};

/*
 * One entry of a struct objlist: a single object number plus the linkage
 * that places it on the objlist's list_t.
 */
struct receive_objnode {
list_node_t node; /* Linkage; its offset is passed to list_create(). */
uint64_t object; /* Object number recorded by objlist_insert(). */
};

struct receive_arg {
objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */
Expand All @@ -1741,12 +1759,7 @@ struct receive_arg {
int err;
boolean_t byteswap;
/* Sorted list of objects not to issue prefetches for. */
list_t ignore_obj_list;
};

struct receive_ign_obj_node {
list_node_t node;
uint64_t object;
struct objlist ignore_objlist;
};

typedef struct guid_map_entry {
Expand Down Expand Up @@ -2063,13 +2076,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
int next_err = 0;

if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));

for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
dmu_object_info_t doi;
int err;

Expand All @@ -2085,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
}

if (next_err != ESRCH)
return (next_err);
return (0);
}

Expand Down Expand Up @@ -2415,6 +2430,70 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
return (0);
}

static void
objlist_create(struct objlist *list)
{
list_create(&list->list, sizeof (struct receive_objnode),
offsetof(struct receive_objnode, node));
list->last_lookup = 0;
}

/*
 * Tear down an objlist: free every node still on the list, then destroy the
 * list itself.  The struct objlist storage is owned by the caller and is not
 * freed here.
 */
static void
objlist_destroy(struct objlist *list)
{
	struct receive_objnode *entry;

	/* Drain from the head until the list reports that it is empty. */
	while ((entry = list_remove_head(&list->list)) != NULL)
		kmem_free(entry, sizeof (*entry));

	list_destroy(&list->list);
}

/*
 * Report whether 'object' is present in the objlist.
 *
 * The lookup is destructive: every entry at the head of the list whose object
 * number is smaller than 'object' is removed and freed along the way.  As a
 * consequence, once an object number has been looked up, a later lookup of
 * any smaller object number always returns false.  Callers must therefore
 * perform their lookups in ascending order, which is asserted via
 * last_lookup.
 */
static boolean_t
objlist_exists(struct objlist *list, uint64_t object)
{
	struct receive_objnode *head;
	boolean_t found;

	ASSERT3U(object, >=, list->last_lookup);
	list->last_lookup = object;

	/* Discard leading entries that sort before the requested object. */
	for (head = list_head(&list->list);
	    head != NULL && head->object < object;
	    head = list_head(&list->list)) {
		VERIFY3P(head, ==, list_remove_head(&list->list));
		kmem_free(head, sizeof (*head));
	}

	/* Whatever is now at the head is the first entry >= object. */
	found = !(head == NULL || head->object != object);
	return (found);
}

/*
 * Record 'object' in the objlist.
 *
 * The objlist holds object numbers in ascending order, but insertion does not
 * search for the right position; the new entry is simply appended at the tail.
 * Callers are responsible for inserting object numbers in ascending order.
 * When ZFS_DEBUG is defined, this is checked against the current tail entry
 * (or against 0 if the list is empty).
 */
static void
objlist_insert(struct objlist *list, uint64_t object)
{
	struct receive_objnode *entry;

	entry = kmem_zalloc(sizeof (struct receive_objnode), KM_SLEEP);
	entry->object = object;
#ifdef ZFS_DEBUG
	{
		struct receive_objnode *tail = list_tail(&list->list);
		uint64_t prev_objnum = 0;

		if (tail != NULL)
			prev_objnum = tail->object;
		ASSERT3U(entry->object, >, prev_objnum);
	}
#endif
	list_insert_tail(&list->list, entry);
}

/*
* Issue the prefetch reads for any necessary indirect blocks.
*
Expand All @@ -2437,13 +2516,7 @@ static void
receive_read_prefetch(struct receive_arg *ra,
uint64_t object, uint64_t offset, uint64_t length)
{
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
while (node != NULL && node->object < object) {
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
kmem_free(node, sizeof (*node));
node = list_head(&ra->ignore_obj_list);
}
if (node == NULL || node->object > object) {
if (!objlist_exists(&ra->ignore_objlist, object)) {
dmu_prefetch(ra->os, object, 1, offset, length,
ZIO_PRIORITY_SYNC_READ);
}
Expand Down Expand Up @@ -2476,20 +2549,7 @@ receive_read_record(struct receive_arg *ra)
*/
if (err == ENOENT ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
struct receive_ign_obj_node *node =
kmem_zalloc(sizeof (*node),
KM_SLEEP);
node->object = drro->drr_object;
#ifdef ZFS_DEBUG
{
struct receive_ign_obj_node *last_object =
list_tail(&ra->ignore_obj_list);
uint64_t last_objnum = (last_object != NULL ?
last_object->object : 0);
ASSERT3U(node->object, >, last_objnum);
}
#endif
list_insert_tail(&ra->ignore_obj_list, node);
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
}
return (err);
Expand Down Expand Up @@ -2706,7 +2766,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
return (0);
}


/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
Expand All @@ -2727,7 +2786,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
struct receive_arg *ra;
struct receive_writer_arg *rwa;
int featureflags;
struct receive_ign_obj_node *n;
uint32_t payloadlen;
void *payload;
nvlist_t *begin_nvl = NULL;
Expand All @@ -2746,8 +2804,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
sizeof (ra->bytes_read), 1, &ra->bytes_read);
}

list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
offsetof(struct receive_ign_obj_node, node));
objlist_create(&ra->ignore_objlist);

/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
Expand Down Expand Up @@ -2901,12 +2958,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
}

*voffp = ra->voff;

for (n = list_remove_head(&ra->ignore_obj_list); n != NULL;
n = list_remove_head(&ra->ignore_obj_list)) {
kmem_free(n, sizeof (*n));
}
list_destroy(&ra->ignore_obj_list);
objlist_destroy(&ra->ignore_objlist);
kmem_free(ra, sizeof (*ra));
kmem_free(rwa, sizeof (*rwa));
return (err);
Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ tests = []
[tests/functional/cli_root/zfs_receive]
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
'zfs_receive_005_neg', 'zfs_receive_006_pos',
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg']
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
'zfs_receive_010_pos']

# DISABLED:
# zfs_rename_002_pos - needs investigation
Expand Down

1 comment on commit e6d3a84

@loli10K
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The link to the Illumos bug tracker in the commit message is off by one: it points to issue 6394, but this change is OpenZFS 6393.

Please sign in to comment.