Skip to content

Commit

Permalink
Fast Clone Deletion
Browse files Browse the repository at this point in the history
Deleting a clone requires finding blocks are clone-only, not shared
with the snapshot. This was done by traversing the entire block tree
which results in a large performance penalty for sparsely
written clones.

This is new method keeps track of clone blocks when they are
modified in a "Livelist" so that, when it’s time to delete,
the clone-specific blocks are already at hand.

We see performance improvements because now deletion work is
proportional to the number of clone-modified blocks, not the size
of the original dataset.

Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
  • Loading branch information
Sara Hartse authored and behlendorf committed Jul 26, 2019
1 parent d274ac5 commit 37f03da
Show file tree
Hide file tree
Showing 38 changed files with 2,581 additions and 203 deletions.
330 changes: 267 additions & 63 deletions cmd/zdb/zdb.c

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions include/sys/bplist.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPLIST_H
Expand Down Expand Up @@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl);
void bplist_append(bplist_t *bpl, const blkptr_t *bp);
void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
void *arg, dmu_tx_t *tx);
void bplist_clear(bplist_t *bpl);

#ifdef __cplusplus
}
Expand Down
19 changes: 15 additions & 4 deletions include/sys/bpobj.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2015, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPOBJ_H
Expand All @@ -31,6 +31,7 @@
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/bplist.h>

#ifdef __cplusplus
extern "C" {
Expand All @@ -48,10 +49,12 @@ typedef struct bpobj_phys {
uint64_t bpo_uncomp;
uint64_t bpo_subobjs;
uint64_t bpo_num_subobjs;
uint64_t bpo_num_freed;
} bpobj_phys_t;

#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))

typedef struct bpobj {
kmutex_t bpo_lock;
Expand All @@ -60,12 +63,14 @@ typedef struct bpobj {
int bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
uint8_t bpo_havefreed;
bpobj_phys_t *bpo_phys;
dmu_buf_t *bpo_dbuf;
dmu_buf_t *bpo_cached_dbuf;
} bpobj_t;

typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);

uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
Expand All @@ -77,17 +82,23 @@ void bpobj_close(bpobj_t *bpo);
boolean_t bpobj_is_open(const bpobj_t *bpo);

int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *);
int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
void *arg, int64_t start);

void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);

int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t bpobj_is_empty(bpobj_t *bpo);

int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 2 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ typedef struct dmu_buf {
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"

/*
* Allocate an object from this objset. The range of object numbers
Expand Down Expand Up @@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os);
extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_objset_blksize(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
Expand Down
2 changes: 1 addition & 1 deletion include/sys/dmu_objset.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
uint64_t os_recordsize;
/*
* The next four values are used as a cache of whatever's on disk, and
* are initialized the first time these properties are queried. Before
Expand Down
31 changes: 29 additions & 2 deletions include/sys/dsl_deadlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,22 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
* Copyright (c) 2018, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_DSL_DEADLIST_H
#define _SYS_DSL_DEADLIST_H

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zthr.h>

#ifdef __cplusplus
extern "C" {
#endif

struct dmu_buf;
struct dsl_pool;
struct dsl_dataset;

typedef struct dsl_deadlist_phys {
Expand Down Expand Up @@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry {
bpobj_t dle_bpobj;
} dsl_deadlist_entry_t;

typedef struct livelist_condense_entry {
struct dsl_dataset *ds;
dsl_deadlist_entry_t *first;
dsl_deadlist_entry_t *next;
boolean_t syncing;
boolean_t cancelled;
} livelist_condense_entry_t;

extern unsigned long zfs_livelist_max_entries;
extern int zfs_livelist_min_percent_shared;

typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle);

void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
void dsl_deadlist_close(dsl_deadlist_t *dl);
void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg);
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp,
boolean_t free, dmu_tx_t *tx);
int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg,
dmu_tx_t *tx);
dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl);
dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl);
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx);
void dsl_deadlist_space(dsl_deadlist_t *dl,
Expand All @@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
zthr_t *t, uint64_t *size);
void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
dmu_tx_t *tx);

#ifdef __cplusplus
}
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_destroy.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ extern "C" {

struct nvlist;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;

int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
Expand Down
14 changes: 12 additions & 2 deletions include/sys/dsl_dir.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
Expand All @@ -29,18 +29,20 @@
#define _SYS_DSL_DIR_H

#include <sys/dmu.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/refcount.h>
#include <sys/zfs_context.h>
#include <sys/dsl_crypt.h>
#include <sys/bplist.h>

#ifdef __cplusplus
extern "C" {
#endif

struct dsl_dataset;

struct zthr;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
* They should be of the format <reverse-dns>:<field>.
Expand All @@ -49,6 +51,7 @@ struct dsl_dataset;
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
#define DD_FIELD_LIVELIST "com.delphix:livelist"

typedef enum dd_used {
DD_USED_HEAD,
Expand Down Expand Up @@ -114,6 +117,10 @@ struct dsl_dir {
/* amount of space we expect to write; == amount of dirty data */
int64_t dd_space_towrite[TXG_SIZE];

dsl_deadlist_t dd_livelist;
bplist_t dd_pending_frees;
bplist_t dd_pending_allocs;

/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
Expand Down Expand Up @@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj);
void dsl_dir_livelist_close(dsl_dir_t *dd);
void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total);

/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
Expand Down
3 changes: 2 additions & 1 deletion include/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/

Expand Down Expand Up @@ -54,6 +54,7 @@ struct dsl_pool;
struct dmu_tx;
struct dsl_scan;
struct dsl_crypto_params;
struct dsl_deadlist;

extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;
Expand Down
9 changes: 9 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
typedef struct zbookmark_phys zbookmark_phys_t;

struct bpobj;
struct bplist;
struct dsl_pool;
struct dsl_dataset;
struct dsl_crypto_params;
Expand Down Expand Up @@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)

#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)

#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
Expand Down Expand Up @@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0)
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/

#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
Expand Down Expand Up @@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
Expand Down Expand Up @@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);

extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
Expand Down
6 changes: 6 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include <sys/dsl_crypt.h>
#include <sys/zfeature.h>
#include <sys/zthr.h>
#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>

#ifdef __cplusplus
Expand Down Expand Up @@ -317,6 +318,11 @@ struct spa {
list_t spa_log_summary;
uint64_t spa_log_flushall_txg;

zthr_t *spa_livelist_delete_zthr; /* deleting livelists */
zthr_t *spa_livelist_condense_zthr; /* condensing livelists */
uint64_t spa_livelists_to_delete; /* set of livelists to free */
livelist_condense_entry_t spa_to_condense; /* next to condense */

char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zthr.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t);
extern void zthr_wakeup(zthr_t *t);
extern void zthr_cancel(zthr_t *t);
extern void zthr_resume(zthr_t *t);
extern void zthr_wait_cycle_done(zthr_t *t);

extern boolean_t zthr_iscancelled(zthr_t *t);
extern boolean_t zthr_has_waiters(zthr_t *t);

#endif /* _SYS_ZTHR_H */
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ typedef enum spa_feature {
SPA_FEATURE_REDACTED_DATASETS,
SPA_FEATURE_BOOKMARK_WRITTEN,
SPA_FEATURE_LOG_SPACEMAP,
SPA_FEATURE_LIVELIST,
SPA_FEATURES
} spa_feature_t;

Expand Down
Loading

0 comments on commit 37f03da

Please sign in to comment.