Skip to content

Commit

Permalink
Fix problems receiving reallocated dnodes
Browse files Browse the repository at this point in the history
This is a port of 047116a - Raw sends must be able to decrease nlevels,
to the zfs-0.7-stable branch.  It includes the various fixes to the
problem of receiving incremental streams which include reallocated dnodes
in which the number of dnode slots has changed but excludes the parts
which are related to raw streams.

From 047116a:

    Currently, when a raw zfs send file includes a
    DRR_OBJECT record that would decrease the number of
    levels of an existing object, the object is reallocated
    with dmu_object_reclaim() which creates the new dnode
    using the old object's nlevels. For non-raw sends this
    doesn't really matter, but raw sends require that
    nlevels on the receive side match that of the send
    side so that the checksum-of-MAC tree can be properly
    maintained. This patch corrects the issue by freeing
    the object completely before allocating it again in
    this case.

    This patch also corrects several issues with
    dnode_hold_impl() and related functions that prevented
    dnodes (particularly multi-slot dnodes) from being
    reallocated properly due to the fact that existing
    dnodes were not being fully cleaned up when they
    were freed.

    This patch adds a test to make sure that zfs recv
    functions properly with incremental streams containing
    dnodes of different sizes.

This also includes a one-liner fix from loli10K to fix a test failure:
openzfs#7792 (comment)

Authored-by: Tom Caputi <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Signed-off-by: Tim Chase <[email protected]>
Ported-by: Tim Chase <[email protected]>

Closes openzfs#6821
Closes openzfs#6864

NOTE: This is the first of the port of 3 related patches patches to the
zfs-0.7-release branch of ZoL.  The other two patches should immediately
follow this one.

Add stuff
  • Loading branch information
dweeezil authored and tonyhutter committed Aug 30, 2018
1 parent 191edfd commit 1ab0710
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 15 deletions.
25 changes: 24 additions & 1 deletion cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
Expand Down Expand Up @@ -310,6 +311,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
Expand Down Expand Up @@ -357,6 +359,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
ZTI_INIT(ztest_zap, 30, &zopt_always),
ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
Expand Down Expand Up @@ -3927,6 +3930,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
umem_free(od, size);
}

/*
* Rewind the global allocator to verify object allocation backfilling.
*/
void
ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
{
objset_t *os = zd->zd_os;
int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
uint64_t object;

/*
* Rewind the global allocator randomly back to a lower object number
* to force backfilling and reclamation of recently freed dnodes.
*/
mutex_enter(&os->os_obj_lock);
object = ztest_random(os->os_obj_next_chunk);
os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
mutex_exit(&os->os_obj_lock);
}

#undef OD_ARRAY_SIZE
#define OD_ARRAY_SIZE 2

Expand Down
6 changes: 6 additions & 0 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
void dnode_free_interior_slots(dnode_t *dn);

#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
Expand Down Expand Up @@ -453,6 +454,11 @@ typedef struct dnode_stats {
* which had already been unlinked in an earlier txg.
*/
kstat_named_t dnode_hold_free_txg;
/*
* Number of times dnode_free_interior_slots() needed to retry
* acquiring a slot zrl lock due to contention.
*/
kstat_named_t dnode_free_interior_lock_retry;
/*
* Number of new dnodes allocated by dnode_allocate().
*/
Expand Down
1 change: 1 addition & 0 deletions lib/libzfs/libzfs_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -3577,6 +3577,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
}

newfs = B_TRUE;
*cp = '/';
}

if (flags->verbose) {
Expand Down
1 change: 0 additions & 1 deletion module/zfs/dmu_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
return (err);
}


int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
Expand Down
51 changes: 46 additions & 5 deletions module/zfs/dmu_send.c
Original file line number Diff line number Diff line change
Expand Up @@ -2156,10 +2156,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
}

err = dmu_object_info(rwa->os, drro->drr_object, &doi);

if (err != 0 && err != ENOENT)
if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;

if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
Expand All @@ -2175,13 +2173,56 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);

object = drro->drr_object;

if (drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr) {
nblkptr < doi.doi_nblkptr ||
drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
} else if (err == EEXIST) {
/*
* The object requested is currently an interior slot of a
* multi-slot dnode. This will be resolved when the next txg
* is synced out, since the send stream will have told us
* to free this slot when we freed the associated dnode
* earlier in the stream.
*/
txg_wait_synced(dmu_objset_pool(rwa->os), 0);
object = drro->drr_object;
} else {
/* object is free and we are about to allocate a new one */
object = DMU_NEW_OBJECT;
}

/*
* If this is a multi-slot dnode there is a chance that this
* object will expand into a slot that is already used by
* another object from the previous snapshot. We must free
* these objects before we attempt to allocate the new dnode.
*/
if (drro->drr_dn_slots > 1) {
for (uint64_t slot = drro->drr_object + 1;
slot < drro->drr_object + drro->drr_dn_slots;
slot++) {
dmu_object_info_t slot_doi;

err = dmu_object_info(rwa->os, slot, &slot_doi);
if (err == ENOENT || err == EEXIST)
continue;
else if (err != 0)
return (err);

err = dmu_free_long_object(rwa->os, slot);

if (err != 0)
return (err);
}

txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}

tx = dmu_tx_create(rwa->os);
Expand Down Expand Up @@ -2732,7 +2773,7 @@ receive_read_record(struct receive_arg *ra)
* See receive_read_prefetch for an explanation why we're
* storing this object in the ignore_obj_list.
*/
if (err == ENOENT ||
if (err == ENOENT || err == EEXIST ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
Expand Down
84 changes: 78 additions & 6 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
{ "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_allocate", KSTAT_DATA_UINT64 },
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
Expand Down Expand Up @@ -516,7 +517,8 @@ dnode_destroy(dnode_t *dn)
mutex_exit(&os->os_lock);

/* the dnode can no longer move, so we can release the handle */
zrl_remove(&dn->dn_handle->dnh_zrlock);
if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
zrl_remove(&dn->dn_handle->dnh_zrlock);

dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
Expand Down Expand Up @@ -662,6 +664,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));

dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;

dnode_free_interior_slots(dn);
DNODE_STAT_BUMP(dnode_reallocate);

/* clean up any unreferenced dbufs */
Expand Down Expand Up @@ -1062,19 +1066,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
}

static boolean_t
dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
if (dnh->dnh_dnode != ptr)
dnode_t *dn = dnh->dnh_dnode;

if (dn == DN_SLOT_FREE) {
continue;
} else if (DN_SLOT_IS_PTR(dn)) {
mutex_enter(&dn->dn_mtx);
dmu_object_type_t type = dn->dn_type;
mutex_exit(&dn->dn_mtx);

if (type != DMU_OT_NONE)
return (B_FALSE);

continue;
} else {
return (B_FALSE);
}

return (B_FALSE);
}

return (B_TRUE);
}

static void
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];

ASSERT(zrl_is_locked(&dnh->dnh_zrlock));

if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
}
}
}

void
dnode_free_interior_slots(dnode_t *dn)
{
dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
int idx = (dn->dn_object & (epb - 1)) + 1;
int slots = dn->dn_num_slots - 1;

if (slots == 0)
return;

ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

while (!dnode_slots_tryenter(children, idx, slots))
DNODE_STAT_BUMP(dnode_free_interior_lock_retry);

dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
dnode_slots_rele(children, idx, slots);
}

void
dnode_special_close(dnode_handle_t *dnh)
{
Expand Down Expand Up @@ -1355,7 +1413,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
while (dn == DN_SLOT_UNINIT) {
dnode_slots_hold(dnc, idx, slots);

if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
Expand All @@ -1368,15 +1426,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
continue;
}

if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
}

/*
* Allocated but otherwise free dnodes which would
* be in the interior of a multi-slot dnodes need
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
if (slots > 1)
dnode_reclaim_slots(dnc, idx + 1, slots - 1);

dnh = &dnc->dnc_children[idx];
dn = dnode_create(os, dn_block + idx, db, object, dnh);
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
dn = dnh->dnh_dnode;
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
}
}

mutex_enter(&dn->dn_mtx);
Expand Down
2 changes: 2 additions & 0 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -533,13 +533,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
dnode_free_interior_slots(dn);

mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);

ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
'send-c_recv_dedup', 'send_freeobjects']
'send-c_recv_dedup', 'send_freeobjects', 'send_realloc_dnode_size']
tags = ['functional', 'rsend']

[tests/functional/scrub_mirror]
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/rsend/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ dist_pkgdata_SCRIPTS = \
send-c_volume.ksh \
send-c_zstreamdump.ksh \
send-cpL_varied_recsize.ksh \
send_freeobjects.ksh
send_freeobjects.ksh \
send_realloc_dnode_size.ksh

dist_pkgdata_DATA = \
rsend.cfg \
Expand Down
Loading

0 comments on commit 1ab0710

Please sign in to comment.