Skip to content

Commit

Permalink
Parallelize vdev_load
Browse files Browse the repository at this point in the history
metaslab_init is the slowest part of importing a mature pool, and it
must be repeated hundreds of times for each top-level vdev.  But its
speed is dominated by a few serialized disk accesses.  That can lead to
import times of > 1 hour for pools with many top-level vdevs on spinny
disks.

Speed up the import by using a taskqueue to parallelize vdev_load across
all top-level vdevs.

This also requires adding mutex protection to
metaslab_class_t.mc_historgram.  The mc_histogram fields were
unprotected when that code was first written in "Illumos 4976-4984 -
metaslab improvements" (OpenZFS
f3a7f66).  The lock wasn't added until
3dfb57a, though it's unclear exactly
which fields it's supposed to protect.  In any case, it wasn't until
vdev_load was parallelized that any code attempted concurrent access to
those fields.

Sponsored by: Axcient
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Alan Somers <[email protected]>
Closes #11470
  • Loading branch information
asomers authored and behlendorf committed Jan 27, 2021
1 parent dfb44c5 commit a0e0199
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 3 deletions.
1 change: 1 addition & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ struct vdev {
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
boolean_t vdev_nonrot; /* true if solid state */
int vdev_load_error; /* error on last load */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
Expand Down
6 changes: 6 additions & 0 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
KM_SLEEP);

mutex_enter(&mc->mc_lock);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = vdev_get_mg(tvd, mc);
Expand All @@ -546,6 +547,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
}

mutex_exit(&mc->mc_lock);
kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

Expand Down Expand Up @@ -1067,6 +1069,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
return;

mutex_enter(&mg->mg_lock);
mutex_enter(&mc->mc_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
IMPLY(mg == mg->mg_vd->vdev_log_mg,
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
Expand All @@ -1075,6 +1078,7 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
mc->mc_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}

Expand All @@ -1089,6 +1093,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
return;

mutex_enter(&mg->mg_lock);
mutex_enter(&mc->mc_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
ASSERT3U(mg->mg_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
Expand All @@ -1102,6 +1107,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
mc->mc_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}

Expand Down
42 changes: 39 additions & 3 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1724,6 +1724,14 @@ vdev_probe(vdev_t *vd, zio_t *zio)
return (NULL);
}

static void
vdev_load_child(void *arg)
{
vdev_t *vd = arg;

vd->vdev_load_error = vdev_load(vd);
}

static void
vdev_open_child(void *arg)
{
Expand Down Expand Up @@ -3350,18 +3358,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
int
vdev_load(vdev_t *vd)
{
int children = vd->vdev_children;
int error = 0;
taskq_t *tq = NULL;

/*
* It's only worthwhile to use the taskq for the root vdev, because the
* slow part is metaslab_init, and that only happens for top-level
* vdevs.
*/
if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
tq = taskq_create("vdev_load", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
}

/*
* Recursively load all children.
*/
for (int c = 0; c < vd->vdev_children; c++) {
error = vdev_load(vd->vdev_child[c]);
if (error != 0) {
return (error);
vdev_t *cvd = vd->vdev_child[c];

if (tq == NULL || vdev_uses_zvols(cvd)) {
cvd->vdev_load_error = vdev_load(cvd);
} else {
VERIFY(taskq_dispatch(tq, vdev_load_child,
cvd, TQ_SLEEP) != TASKQID_INVALID);
}
}

if (tq != NULL) {
taskq_wait(tq);
taskq_destroy(tq);
}

for (int c = 0; c < vd->vdev_children; c++) {
int error = vd->vdev_child[c]->vdev_load_error;

if (error != 0)
return (error);
}

vdev_set_deflate_ratio(vd);

/*
Expand Down

0 comments on commit a0e0199

Please sign in to comment.