Skip to content

Commit

Permalink
3104 eliminate empty bpobjs
Browse files Browse the repository at this point in the history
Reviewed by: George Wilson <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Garrett D'Amore <[email protected]>
Approved by: Eric Schrock <[email protected]>
  • Loading branch information
ahrens committed Aug 27, 2012
1 parent f4c46b1 commit f174573
Show file tree
Hide file tree
Showing 12 changed files with 166 additions and 11 deletions.
3 changes: 3 additions & 0 deletions usr/src/common/zfs/zfeature_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
"Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.", B_TRUE, B_FALSE, NULL);
}
1 change: 1 addition & 0 deletions usr/src/common/zfs/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);

/*
 * Identifiers for the pool features known to this code.  Each value is
 * used as an index into spa_feature_table[] to find the matching
 * zfeature_info_t.  New features must be added before SPA_FEATURES so
 * that it stays last.
 *
 * NOTE(review): no `typedef` keyword is visible on this declaration in
 * this view, so as written spa_feature_t names an object of this enum
 * type rather than a type alias -- confirm against the full header.
 */
enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURES	/* last entry: equals the number of features listed above */
} spa_feature_t;

Expand Down
28 changes: 28 additions & 0 deletions usr/src/man/man5/zpool-features.5
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,33 @@ through the \fBfreeing\fR property.

This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
.RE

.sp
.ne 2
.na
\fB\fBempty_bpobj\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:empty_bpobj
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE

This feature increases the performance of creating and using a large
number of snapshots of a single filesystem or volume, and also reduces
the disk space required.

When there are many snapshots, each snapshot uses many Block Pointer
Objects (bpobjs) to track blocks associated with that snapshot.
However, in common use cases, most of these bpobjs are empty. This
feature allows us to create each bpobj on demand, thus eliminating the
empty bpobjs.

This feature is \fBactive\fR while there are any filesystems, volumes,
or snapshots which were created after enabling this feature.
.RE

.SH "SEE ALSO"
\fBzpool\fR(1M)
58 changes: 57 additions & 1 deletion usr/src/uts/common/fs/zfs/bpobj.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,61 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Hand back the object number of a bpobj for a caller that needs one with
 * nothing in it yet.
 *
 * If the empty_bpobj feature is not enabled on the pool, this is simply
 * bpobj_alloc(os, blocksize, tx).
 *
 * If the feature is enabled, the pool-wide placeholder recorded in
 * dp->dp_empty_bpobj is returned instead and the feature's reference count
 * is bumped.  The placeholder is created (always with SPA_MAXBLOCKSIZE;
 * `blocksize' is not consulted on this path) and entered in the pool
 * directory under DMU_POOL_EMPTY_BPOBJ the first time it is needed, i.e.
 * while the feature is enabled but not yet active.
 *
 * References taken here are dropped with bpobj_decr_empty().
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];

	/* Without the feature every caller gets its own real bpobj. */
	if (!spa_feature_is_enabled(spa, feat))
		return (bpobj_alloc(os, blocksize, tx));

	/* First user since the feature went inactive: build the placeholder. */
	if (!spa_feature_is_active(spa, feat)) {
		ASSERT3U(dp->dp_empty_bpobj, ==, 0);
		dp->dp_empty_bpobj = bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj, tx) == 0);
	}

	/* One more holder of the shared placeholder. */
	spa_feature_incr(spa, feat, tx);
	ASSERT(dp->dp_empty_bpobj != 0);

	return (dp->dp_empty_bpobj);
}

/*
 * Drop one reference on the pool's shared empty bpobj by decrementing the
 * empty_bpobj feature's reference count.
 *
 * If the feature is no longer active after the decrement, the placeholder
 * is torn down: its DMU_POOL_EMPTY_BPOBJ entry is removed from the pool
 * directory, the object itself is freed, and dp->dp_empty_bpobj is reset
 * to 0.
 */
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);
	spa_t *spa = dmu_objset_spa(os);
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];

	spa_feature_decr(spa, feat, tx);

	/* Somebody else still refers to the placeholder; leave it alone. */
	if (spa_feature_is_active(spa, feat))
		return;

	VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_EMPTY_BPOBJ, tx));
	VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
	dp->dp_empty_bpobj = 0;
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
Expand All @@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
int epb;
dmu_buf_t *dbuf = NULL;

ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

mutex_enter(&bpo.bpo_lock);
Expand Down Expand Up @@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)

ASSERT(bpo->bpo_havesubobj);
ASSERT(bpo->bpo_havecomp);
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
bpobj_decr_empty(bpo->bpo_os, tx);
return;
}

VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
Expand Down Expand Up @@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
blkptr_t *bparray;

ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

/* We never need the fill count. */
stored_bp.blk_fill = 0;
Expand Down
54 changes: 45 additions & 9 deletions usr/src/uts/common/fs/zfs/dsl_deadlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#include <sys/dsl_dataset.h>
Expand Down Expand Up @@ -163,12 +163,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)

for (zap_cursor_init(&zc, os, dlobj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc))
bpobj_free(os, za.za_first_integer, tx);
zap_cursor_advance(&zc)) {
uint64_t obj = za.za_first_integer;
if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
bpobj_decr_empty(os, tx);
else
bpobj_free(os, obj, tx);
}
zap_cursor_fini(&zc);
VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
}

/*
 * Append block pointer `bp' to the bpobj of deadlist entry `dle'.
 *
 * If the entry currently refers to the pool's shared empty bpobj, it is
 * first switched over to a freshly allocated bpobj: the new object is
 * allocated, the entry's handle on the placeholder is closed and its
 * reference dropped, the handle is re-opened on the new object, and the
 * deadlist's ZAP entry for dle_mintxg is updated to the new object number.
 */
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t placeholder = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object == placeholder) {
		uint64_t obj;

		/* Allocate the replacement before releasing the placeholder. */
		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);

		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);

		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}

	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
}

/*
 * Attach bpobj `obj' to deadlist entry `dle'.
 *
 * If the entry already has a bpobj of its own, `obj' is enqueued on it as
 * a sub-object.  If the entry still refers to the pool's shared empty
 * bpobj, nothing is nested: the entry's handle on the placeholder is
 * closed and its reference dropped, the handle is re-opened directly on
 * `obj', and the deadlist's ZAP entry for dle_mintxg is updated to `obj'.
 */
static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
	uint64_t placeholder = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object == placeholder) {
		/* Entry is still empty: adopt obj as the entry's bpobj. */
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
		return;
	}

	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -197,7 +234,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
else
dle = AVL_PREV(&dl->dl_tree, dle);
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
dle_enqueue(dl, dle, bp, tx);
}

/*
Expand All @@ -217,7 +254,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)

dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);

Expand All @@ -243,8 +280,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
dle_prev = AVL_PREV(&dl->dl_tree, dle);

bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
dle->dle_bpobj.bpo_object, tx);
dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

avl_remove(&dl->dl_tree, dle);
bpobj_close(&dle->dle_bpobj);
Expand Down Expand Up @@ -302,7 +338,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;

obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
Expand Down Expand Up @@ -400,7 +436,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
dle_enqueue_subobj(dl, dle, obj, tx);
}

static int
Expand Down
9 changes: 9 additions & 0 deletions usr/src/uts/common/fs/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,15 @@ dsl_pool_open(dsl_pool_t *dp)
goto out;
}

if (spa_feature_is_active(dp->dp_spa,
&spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj);
if (err != 0)
goto out;
}

err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
&dp->dp_tmp_userrefs_obj);
Expand Down
3 changes: 3 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/bpobj.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPOBJ_H
Expand Down Expand Up @@ -67,7 +68,9 @@ typedef struct bpobj {
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);

int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
void bpobj_close(bpobj_t *bpo);
Expand Down
1 change: 1 addition & 0 deletions usr/src/uts/common/fs/zfs/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"

/*
* Allocate an object from this objset. The range of object numbers
Expand Down
1 change: 1 addition & 0 deletions usr/src/uts/common/fs/zfs/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ typedef struct dsl_pool {
uint64_t dp_tmp_userrefs_obj;
bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj;
uint64_t dp_empty_bpobj;

struct dsl_scan *dp_scan;

Expand Down
2 changes: 2 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/zap.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
/* Here the key is an int and the value is a different int. */
int zap_add_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_update_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t *valuep);

Expand Down
10 changes: 10 additions & 0 deletions usr/src/uts/common/fs/zfs/zap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
return (zap_add(os, obj, name, 8, 1, &value, tx));
}

/*
 * Set the entry of ZAP object `obj' whose name is `key' rendered as
 * lower-case hexadecimal to the single 64-bit integer `value', by way of
 * zap_update().  Returns whatever zap_update() returns.
 */
int
zap_update_int_key(objset_t *os, uint64_t obj,
    uint64_t key, uint64_t value, dmu_tx_t *tx)
{
	/* A 64-bit key is at most 16 hex digits, so 20 bytes never truncates. */
	char keystr[20];

	(void) snprintf(keystr, sizeof (keystr), "%llx", (longlong_t)key);

	return (zap_update(os, obj, keystr, sizeof (uint64_t), 1,
	    &value, tx));
}

int
zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
{
Expand Down
7 changes: 6 additions & 1 deletion usr/src/uts/common/fs/zfs/zfeature.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,12 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
uint64_t refcount;
uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;

ASSERT(0 != zapobj);
/*
* If the pool is currently being created, the feature objects may not
* have been allocated yet. Act as though all features are disabled.
*/
if (zapobj == 0)
return (ENOTSUP);

err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
&refcount);
Expand Down

0 comments on commit f174573

Please sign in to comment.