Skip to content

Commit

Permalink
Log Spacemap (openzfs#43)
Browse files Browse the repository at this point in the history
Signed-off-by: Serapheim Dimitropoulos <[email protected]>
  • Loading branch information
sdimitro authored and ahrens committed Apr 23, 2019
1 parent 2036c30 commit b06927c
Show file tree
Hide file tree
Showing 34 changed files with 3,109 additions and 317 deletions.
390 changes: 352 additions & 38 deletions cmd/zdb/zdb.c

Large diffs are not rendered by default.

33 changes: 15 additions & 18 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -2950,24 +2950,12 @@ vdev_lookup_by_path(vdev_t *vd, const char *path)
return (NULL);
}

/*
* Find the first available hole which can be used as a top-level.
*/
int
find_vdev_hole(spa_t *spa)
static int
spa_num_top_vdevs(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
int c;

ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);

for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];

if (cvd->vdev_ishole)
break;
}
return (c);
ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV);
return (rvd->vdev_children);
}

/*
Expand All @@ -2992,7 +2980,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)

spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;

/*
* If we have slogs then remove them 1/4 of the time.
Expand Down Expand Up @@ -3099,7 +3087,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;

spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
spa_config_exit(spa, SCL_VDEV, FTAG);

nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
Expand Down Expand Up @@ -7488,6 +7476,15 @@ ztest_init(ztest_shared_t *zs)

for (i = 0; i < SPA_FEATURES; i++) {
char *buf;

/*
* 75% chance of using the log space map feature. We want ztest
* to exercise both the code paths that use the log space map
* feature and the ones that don't.
*/
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;

VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
Expand Down
3 changes: 2 additions & 1 deletion include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ COMMON_H = \
$(top_srcdir)/include/sys/bptree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/cityhash.h \
$(top_srcdir)/include/sys/spa_checkpoint.h \
$(top_srcdir)/include/sys/dataset_kstats.h \
$(top_srcdir)/include/sys/dbuf.h \
$(top_srcdir)/include/sys/ddt.h \
Expand Down Expand Up @@ -63,6 +62,8 @@ COMMON_H = \
$(top_srcdir)/include/sys/sha2.h \
$(top_srcdir)/include/sys/skein.h \
$(top_srcdir)/include/sys/spa_boot.h \
$(top_srcdir)/include/sys/spa_checkpoint.h \
$(top_srcdir)/include/sys/spa_log_spacemap.h \
$(top_srcdir)/include/sys/space_map.h \
$(top_srcdir)/include/sys/space_reftree.h \
$(top_srcdir)/include/sys/spa.h \
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ typedef struct dmu_buf {
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"

/*
* Allocate an object from this objset. The range of object numbers
Expand Down
2 changes: 2 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,8 @@ typedef struct zpool_load_policy {
"com.delphix:obsolete_counts_are_precise"
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \
"com.delphix:ms_unflushed_phys_txgs"

#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
"org.zfsonlinux:allocation_bias"
Expand Down
11 changes: 11 additions & 0 deletions include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,16 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **);
void metaslab_fini(metaslab_t *);

void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *);
void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *);
uint64_t metaslab_unflushed_txg(metaslab_t *);
uint64_t metaslab_estimated_condensed_size(metaslab_t *);
int metaslab_sort_by_flushed(const void *, const void *);
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);

int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);

uint64_t metaslab_allocated_space(metaslab_t *);

Expand Down Expand Up @@ -107,6 +115,9 @@ uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);

void metaslab_space_update(vdev_t *, metaslab_class_t *,
int64_t, int64_t, int64_t);

metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
Expand Down
28 changes: 25 additions & 3 deletions include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
Expand Down Expand Up @@ -357,7 +357,7 @@ struct metaslab {
* write to metaslab data on-disk (i.e. flushing entries to
* the metaslab's space map). It helps coordinate readers of
* the metaslab's space map [see spa_vdev_remove_thread()]
* with writers [see metaslab_sync()].
* with writers [see metaslab_sync() or metaslab_flush()].
*
* Note that metaslab_load(), even though a reader, uses
* a completely different mechanism to deal with the reading
Expand Down Expand Up @@ -401,7 +401,6 @@ struct metaslab {

boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;

/*
* The number of consumers which have disabled the metaslab.
Expand All @@ -414,6 +413,8 @@ struct metaslab {
*/
boolean_t ms_loaded;
boolean_t ms_loading;
kcondvar_t ms_flush_cv;
boolean_t ms_flushing;

/*
* The following histograms count entries that are in the
Expand Down Expand Up @@ -499,13 +500,34 @@ struct metaslab {
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */

/*
* Allocs and frees that are committed to the vdev log spacemap but
* not yet to this metaslab's spacemap.
*/
range_tree_t *ms_unflushed_allocs;
range_tree_t *ms_unflushed_frees;

/*
* We have flushed entries up to but not including this TXG. In
* other words, all changes from this TXG and onward should not
* be in this metaslab's space map and must be read from the
* log space maps.
*/
uint64_t ms_unflushed_txg;

/* updated every time we are done syncing the metaslab's space map */
uint64_t ms_synced_length;

boolean_t ms_new;
};

/*
 * Persistent (on-disk) record of a metaslab's flush progress. It holds the
 * stored form of the in-core ms_unflushed_txg field of struct metaslab.
 *
 * NOTE(review): presumably kept per metaslab under the
 * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS entry of the top-level vdev ZAP;
 * confirm against the code that reads and writes it.
 */
typedef struct metaslab_unflushed_phys {
/* on-disk counterpart of ms_unflushed_txg */
uint64_t msp_unflushed_txg;
} metaslab_unflushed_phys_t;

#ifdef __cplusplus
}
#endif
Expand Down
8 changes: 7 additions & 1 deletion include/sys/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_RANGE_TREE_H
Expand Down Expand Up @@ -95,6 +95,7 @@ range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
uint64_t range_tree_numsegs(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
Expand All @@ -112,6 +113,11 @@ void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
range_seg_t *range_tree_first(range_tree_t *rt);

void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
range_tree_t *removefrom, range_tree_t *addto);
void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
range_tree_t *addto);

void rt_avl_create(range_tree_t *rt, void *arg);
void rt_avl_destroy(range_tree_t *rt, void *arg);
void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
Expand Down
5 changes: 4 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
Expand All @@ -42,6 +42,7 @@
#include <sys/fs/zfs.h>
#include <sys/spa_checksum.h>
#include <sys/dmu.h>
#include <sys/space_map.h>

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -1075,6 +1076,7 @@ extern boolean_t spa_suspended(spa_t *spa);
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern space_map_t *spa_syncing_log_sm(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
extern uint64_t spa_deadman_ziotime(spa_t *spa);
extern uint64_t spa_dirty_data(spa_t *spa);
Expand Down Expand Up @@ -1125,6 +1127,7 @@ extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
Expand Down
11 changes: 10 additions & 1 deletion include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
Expand All @@ -34,6 +34,7 @@

#include <sys/spa.h>
#include <sys/spa_checkpoint.h>
#include <sys/spa_log_spacemap.h>
#include <sys/vdev.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
Expand Down Expand Up @@ -313,6 +314,14 @@ struct spa {
uint64_t spa_livelists_to_delete; /* set of livelists to free */
livelist_condense_entry_t spa_to_condense; /* next to condense */

space_map_t *spa_syncing_log_sm; /* current log space map */
avl_tree_t spa_sm_logs_by_txg;
kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */
avl_tree_t spa_metaslabs_by_flushed;
spa_unflushed_stats_t spa_unflushed_stats;
list_t spa_log_summary;
uint64_t spa_log_flushall_txg;

char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
Expand Down
77 changes: 77 additions & 0 deletions include/sys/spa_log_spacemap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2018, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPA_LOG_SPACEMAP_H
#define _SYS_SPA_LOG_SPACEMAP_H

#include <sys/avl.h>

/*
 * A log summary entry: a starting TXG together with a metaslab counter and
 * a block counter, plus the linkage needed to keep the entry on a list.
 *
 * NOTE(review): presumably these entries live on the spa_log_summary list
 * of struct spa; confirm against the implementation file.
 */
typedef struct log_summary_entry {
uint64_t lse_start; /* start TXG */
uint64_t lse_mscount; /* # of metaslabs that need to be flushed */
uint64_t lse_blkcount; /* blocks held by this entry */
list_node_t lse_node; /* list linkage for this entry */
} log_summary_entry_t;

/*
 * Counters describing the unflushed state tracked for a pool (struct spa
 * embeds one instance as spa_unflushed_stats). The fields are grouped by
 * the heuristic that consumes them: one based on memory usage and one
 * based on the number of log blocks.
 */
typedef struct spa_unflushed_stats {
/* used for memory heuristic */
uint64_t sus_memused; /* current memory used for unflushed trees */

/* used for block heuristic */
uint64_t sus_blocklimit; /* max # of log blocks allowed */
uint64_t sus_nblocks; /* # of blocks in log space maps currently */
} spa_unflushed_stats_t;

/*
 * Bookkeeping record for one log space map: the object ID of the space
 * map, the txg it logs, its size in blocks, and a count of metaslabs
 * flushed in that txg. Records are kept in the spa_sm_logs_by_txg AVL
 * tree through sls_node.
 */
typedef struct spa_log_sm {
uint64_t sls_sm_obj; /* space map object ID */
uint64_t sls_txg; /* txg logged on the space map */
uint64_t sls_nblocks; /* number of blocks in this log */
uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */
avl_node_t sls_node; /* node in spa_sm_logs_by_txg */
} spa_log_sm_t;

int spa_ld_log_spacemaps(spa_t *);

void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *);
void spa_flush_metaslabs(spa_t *, dmu_tx_t *);
void spa_sync_close_syncing_log_sm(spa_t *);

void spa_cleanup_old_sm_logs(spa_t *, dmu_tx_t *);

uint64_t spa_log_sm_blocklimit(spa_t *);
void spa_log_sm_set_blocklimit(spa_t *);
uint64_t spa_log_sm_nblocks(spa_t *);
uint64_t spa_log_sm_memused(spa_t *);

void spa_log_sm_decrement_mscount(spa_t *, uint64_t);
void spa_log_sm_increment_current_mscount(spa_t *);

void spa_log_summary_add_flushed_metaslab(spa_t *);
void spa_log_summary_decrement_mscount(spa_t *, uint64_t);
void spa_log_summary_decrement_blkcount(spa_t *, uint64_t);

boolean_t spa_flush_all_logs_requested(spa_t *);

#endif /* _SYS_SPA_LOG_SPACEMAP_H */
9 changes: 8 additions & 1 deletion include/sys/space_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPACE_MAP_H
Expand Down Expand Up @@ -72,6 +72,11 @@ typedef struct space_map_phys {
* bucket, smp_histogram[i], contains the number of free regions
* whose size is:
* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
*
* Note that, if the log space map feature is enabled, histograms of
* space maps that belong to metaslabs will take into account any
* unflushed changes for their metaslabs, even though the actual
* space map doesn't have entries for these changes.
*/
uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
} space_map_phys_t;
Expand Down Expand Up @@ -209,6 +214,8 @@ void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
uint64_t space_map_object(space_map_t *sm);
int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt);
uint64_t space_map_nblocks(space_map_t *sm);

void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t vdev_id, dmu_tx_t *tx);
Expand Down
Loading

0 comments on commit b06927c

Please sign in to comment.