diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index e4435e0caa1b..c2c548925b23 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -92,6 +92,7 @@
 void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp);
 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
 void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 08b8c4a26e7e..f32158f0c044 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1385,11 +1385,6 @@ dsl_scan_prefetch_thread(void *arg)
 	/* loop until we are told to stop */
 	while (!scn->scn_prefetch_stop) {
-		/*
-		 * XXX These are prefetches, but unfortunately the ARC
-		 * currently expects prefetches to not have a "done"
-		 * function, which we need.
-		 */
 		arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_LONG_LIFE |
 		    ARC_FLAG_PREFETCH;
@@ -1434,6 +1429,7 @@ dsl_scan_prefetch_thread(void *arg)
 	mutex_enter(&scn->scn_prefetch_lock);
 	while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
 		avl_remove(&scn->scn_prefetch_queue, spic);
+		scan_prefetch_ctx_rele(spic->spic_spc, scn);
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
@@ -3425,6 +3421,7 @@
 scan_io_queue_insert_cb(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
 	avl_index_t idx;
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	VERIFY3P(avl_find(&queue->q_exts_by_size, rs, &idx), ==, NULL);
 	avl_insert(&queue->q_exts_by_size, rs, idx);
@@ -3436,6 +3433,8 @@
 static void
 scan_io_queue_remove_cb(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	avl_remove(&queue->q_exts_by_size, rs);
 }
@@ -3446,7 +3445,10 @@
 scan_io_queue_vacate_cb(range_tree_t *rt, void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
 	void *cookie = NULL;
-	while (avl_destroy_nodes(&queue->q_exts_by_size, &cookie) != NULL);
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+	while (avl_destroy_nodes(&queue->q_exts_by_size, &cookie) != NULL)
+		;
 }
 /*
  *
@@ -3671,27 +3673,30 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 	sio = avl_find(&queue->q_zios_by_addr, &srch, &idx);
 	if (sio != NULL) {
 		range_seg_t *rs;
+		blkptr_t tmpbp;
+		uint64_t start = SCAN_IO_GET_OFFSET(sio);
+		int64_t size = sio->sio_asize;
 		/* Got it while it was cold in the queue */
-		ASSERT3U(srch.sio_asize, ==, sio->sio_asize);
-		count_block(scn, dp->dp_blkstats, bp);
-		ASSERT(range_tree_contains(queue->q_exts_by_addr, offset,
-		    asize));
+		ASSERT3U(srch.sio_asize, ==, size);
 		avl_remove(&queue->q_zios_by_addr, sio);
+		rs = range_tree_find(queue->q_exts_by_addr, start, size);
+		ASSERT(rs != NULL);
+		ASSERT3U(rs->rs_fill, >=, size);
+
 		/*
-		 * Since we're taking this scan_io_t out of its parent
-		 * range_seg_t, we need to alter the range_seg_t's rs_fill
-		 * value, which might change its place in q_exts_by_size.
+		 * If this sio constitutes everything in the segment, simply
+		 * remove it. Otherwise, we adjust the fill so that this
+		 * segment is appropriately de-prioritized in q_exts_by_size.
 		 */
-		rs = range_tree_find(queue->q_exts_by_addr,
-		    SCAN_IO_GET_OFFSET(sio), sio->sio_asize);
-		ASSERT(rs != NULL);
-		ASSERT3U(rs->rs_fill, >=, sio->sio_asize);
-		avl_remove(&queue->q_exts_by_size, rs);
-		rs->rs_fill -= sio->sio_asize;
-		VERIFY3P(avl_find(&queue->q_exts_by_size, rs, &idx), ==, NULL);
-		avl_insert(&queue->q_exts_by_size, rs, idx);
+		if (rs->rs_fill == size) {
+			range_tree_remove(queue->q_exts_by_addr,
+			    rs->rs_start, rs->rs_end - rs->rs_start);
+		} else {
+			range_tree_adjust_fill(queue->q_exts_by_addr,
+			    rs, -size);
+		}
 		/*
 		 * We only update the queue byte counter in the cold path,
@@ -3706,14 +3711,17 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 		scn->scn_bytes_pending -= asize;
 		mutex_exit(&scn->scn_status_lock);
+		sio2bp(sio, &tmpbp, dva_i);
+		count_block(scn, dp->dp_blkstats, &tmpbp);
+
 		kmem_free(sio, sizeof (*sio));
 	} else {
 		/*
 		 * If it's part of an extent that's currently being issued or
 		 * it isn't in the queue at all, wait until the extent has been
 		 * consumed. In this case it's not us who is dequeueing this
-		 * zio, so no need to decrement its size from scn_bytes_pending
-		 * or the queue.
+		 * zio, so we don't need to decrement its size from
+		 * scn_bytes_pending or the queue.
 		 */
 		while (queue->q_issuing_rs.rs_start <= offset &&
 		    queue->q_issuing_rs.rs_end >= offset + asize) {
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 462aadafa540..cc8e90dbb2c3 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -212,7 +212,7 @@ range_tree_destroy(range_tree_t *rt)
 }
 void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
 	range_tree_t *rt = arg;
 	avl_index_t where;
@@ -222,7 +222,8 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 	boolean_t merge_before, merge_after;
 	ASSERT(MUTEX_HELD(rt->rt_lock));
-	VERIFY(size != 0);
+	ASSERT3U(size, !=, 0);
+	ASSERT3U(fill, <=, size);
 	rsearch.rs_start = start;
 	rsearch.rs_end = end;
@@ -239,15 +240,39 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 	/*
 	 * If this is a gap-supporting range tree, it is possible that we
 	 * are inserting into an existing segment. In this case simply
-	 * bump the fill count and call the remove / add callbacks.
+	 * bump the fill count and call the remove / add callbacks. If the
+	 * new range extends beyond an existing segment, we remove the
+	 * existing segment, merge the new extent into it, and re-insert
+	 * the result using the normal code paths.
 	 */
 	if (rs != NULL) {
 		ASSERT3U(gap, !=, 0);
+		if (rs->rs_start <= start && rs->rs_end >= end) {
+			if (rt->rt_ops != NULL &&
+			    rt->rt_ops->rtop_remove != NULL)
+				rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+			rs->rs_fill += size;
+			if (rt->rt_ops != NULL &&
+			    rt->rt_ops->rtop_add != NULL)
+				rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+			return;
+		}
+
+		avl_remove(&rt->rt_root, rs);
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
-		rs->rs_fill += size;
-		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
-			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+		range_tree_stat_decr(rt, rs);
+		rt->rt_space -= rs->rs_end - rs->rs_start;
+
+		fill = rs->rs_fill + size;
+		start = MIN(start, rs->rs_start);
+		end = MAX(end, rs->rs_end);
+		size = end - start;
+
+		range_tree_add_impl(rt, start, size, fill);
+
+		kmem_cache_free(range_seg_cache, rs);
 		return;
 	}
@@ -280,7 +305,7 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 		range_tree_stat_decr(rt, rs_after);
 		if (gap != 0)
-			rs_after->rs_fill += rs_before->rs_fill + size;
+			rs_after->rs_fill += rs_before->rs_fill + fill;
 		rs_after->rs_start = rs_before->rs_start;
 		kmem_cache_free(range_seg_cache, rs_before);
 		rs = rs_after;
@@ -291,7 +316,7 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 		range_tree_stat_decr(rt, rs_before);
 		if (gap != 0)
-			rs_before->rs_fill += size;
+			rs_before->rs_fill += fill;
 		rs_before->rs_end = end;
 		rs = rs_before;
 	} else if (merge_after) {
@@ -301,19 +326,20 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 		range_tree_stat_decr(rt, rs_after);
 		if (gap != 0)
-			rs_after->rs_fill += size;
+			rs_after->rs_fill += fill;
 		rs_after->rs_start = start;
 		rs = rs_after;
 	} else {
 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
 		if (gap != 0)
-			rs->rs_fill = size;
+			rs->rs_fill = fill;
 		rs->rs_start = start;
 		rs->rs_end = end;
 		avl_insert(&rt->rt_root, rs, where);
 	}
+	/* TODO: fails occasionally */
 	if (gap != 0)
 		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
@@ -324,6 +350,12 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 	rt->rt_space += size + bridge_size;
 }
+void
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+	range_tree_add_impl(arg, start, size, size);
+}
+
 void
 range_tree_remove(void *arg, uint64_t start, uint64_t size)
 {
@@ -354,6 +386,7 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 	 * This allows us to maintain accurate fill accounting and to ensure
 	 * that bridged sections are not leaked.
 	 */
+	/* TODO: fails occasionally */
 	if (rt->rt_gap != 0 && (rs->rs_start != start || rs->rs_end != end)) {
 		zfs_panic_recover("zfs: freeing partial segment of gap tree "
 		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)",
@@ -405,6 +438,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 	rt->rt_space -= size;
 }
+void
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT3U(rs->rs_fill + delta, !=, 0);
+	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+	rs->rs_fill += delta;
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
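
Usage note on the new range_tree_adjust_fill() interface: the cold path of
dsl_scan_freed_dva() above is the intended calling pattern. The sketch below
restates that pattern in isolation, as if it sat next to dsl_scan_freed_dva()
in module/zfs/dsl_scan.c; scan_queue_drop_io() is a hypothetical helper, not
part of this patch, and it assumes the caller already holds the tree's lock
(set via range_tree_set_lock(); for the scan queue that is the per-vdev
vdev_scan_io_queue_lock asserted by the callbacks above), which
range_tree_adjust_fill() checks with MUTEX_HELD().

/*
 * Hypothetical helper (illustration only): drop "size" queued bytes at
 * "start" from a scan queue's extent tree, mirroring the cold path of
 * dsl_scan_freed_dva() above.
 */
static void
scan_queue_drop_io(dsl_scan_io_queue_t *queue, uint64_t start, int64_t size)
{
	range_seg_t *rs;

	rs = range_tree_find(queue->q_exts_by_addr, start, size);
	ASSERT(rs != NULL);
	ASSERT3U(rs->rs_fill, >=, size);

	if (rs->rs_fill == size) {
		/* This was the last queued I/O in the segment; drop it all. */
		range_tree_remove(queue->q_exts_by_addr,
		    rs->rs_start, rs->rs_end - rs->rs_start);
	} else {
		/*
		 * Shrink the fill; the rtop_remove/rtop_add callbacks
		 * (scan_io_queue_remove_cb/insert_cb) re-sort the segment
		 * in q_exts_by_size so its issue priority matches the
		 * reduced fill.
		 */
		range_tree_adjust_fill(queue->q_exts_by_addr, rs, -size);
	}
}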
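
Fill-accounting note on range_tree_add_impl(): for a gap-supporting tree,
rs_fill tracks how many bytes of a segment are actually populated, while
bridged holes are charged to rt_space (via bridge_size) but not to the fill;
range_tree_add() preserves the old behavior by passing fill == size. The
sketch below is illustrative only: it assumes rt is an existing
gap-supporting tree with rt_gap of at least 0x800 whose lock is held, and the
merge itself depends on the neighbor-bridging logic outside these hunks.

#include <sys/range_tree.h>

/* Illustration only; gap_tree_fill_example() is not part of this patch. */
static void
gap_tree_fill_example(range_tree_t *rt)
{
	/* One segment: [0x0000, 0x1000), rs_fill == 0x1000. */
	range_tree_add(rt, 0x0000, 0x1000);

	/*
	 * Assuming the 0x800-byte hole is within rt_gap and gets bridged,
	 * the segments merge into [0x0000, 0x2000) with rs_fill == 0x1800:
	 * rt_space is charged for the new bytes plus the bridged bytes,
	 * and the invariant rs_fill <= rs_end - rs_start still holds.
	 */
	range_tree_add(rt, 0x1800, 0x0800);
}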