Skip to content

Commit

Permalink
Illumos #4374
Browse files Browse the repository at this point in the history
4374 dn_free_ranges should use range_tree_t

Reviewed by: George Wilson <[email protected]>
Reviewed by: Max Grossman <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Garrett D'Amore <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
  https://www.illumos.org/issues/4374
  illumos/illumos-gate@bf16b11

Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2531
  • Loading branch information
ahrens authored and behlendorf committed Jul 30, 2014
1 parent da53684 commit 9bd274d
Show file tree
Hide file tree
Showing 19 changed files with 128 additions and 174 deletions.
5 changes: 3 additions & 2 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#include <stdio.h>
Expand Down Expand Up @@ -2788,7 +2788,8 @@ dump_simulated_ddt(spa_t *spa)
dds.dds_ref_psize = zdde->zdde_ref_psize;
dds.dds_ref_dsize = zdde->zdde_ref_dsize;

ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0);
ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
&dds, 0);

umem_free(zdde, sizeof (*zdde));
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/zpool/zpool_vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
int sector_size;

if (check_sector_size_database(path, &sector_size) == B_TRUE)
ashift = highbit(sector_size) - 1;
ashift = highbit64(sector_size) - 1;
}

if (ashift > 0)
Expand Down
6 changes: 2 additions & 4 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_DNODE_H
Expand Down Expand Up @@ -220,7 +220,7 @@ typedef struct dnode {
/* protected by dn_mtx: */
kmutex_t dn_mtx;
list_t dn_dirty_records[TXG_SIZE];
avl_tree_t dn_ranges[TXG_SIZE];
struct range_tree *dn_free_ranges[TXG_SIZE];
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;
uint64_t dn_assigned_txg;
Expand Down Expand Up @@ -302,8 +302,6 @@ void dnode_buf_byteswap(void *buf, size_t size);
void dnode_verify(dnode_t *dn);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
void dnode_clear_range(dnode_t *dn, uint64_t blkid,
uint64_t nblks, dmu_tx_t *tx);
void dnode_diduse_space(dnode_t *dn, int64_t space);
void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
Expand Down
3 changes: 2 additions & 1 deletion include/sys/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_RANGE_TREE_H
Expand Down Expand Up @@ -85,6 +85,7 @@ void range_tree_stat_verify(range_tree_t *rt);

void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);

void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
Expand Down
4 changes: 2 additions & 2 deletions include/sys/zfs_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_ZFS_CONTEXT_H
Expand Down Expand Up @@ -623,7 +623,7 @@ extern void delay(clock_t ticks);

extern uint64_t physmem;

extern int highbit(ulong_t i);
extern int highbit64(uint64_t i);
extern int random_get_bytes(uint8_t *ptr, size_t len);
extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);

Expand Down
6 changes: 2 additions & 4 deletions lib/libzpool/kernel.c
Original file line number Diff line number Diff line change
Expand Up @@ -1014,17 +1014,15 @@ delay(clock_t ticks)
* High order bit is 31 (or 63 in _LP64 kernel).
*/
int
highbit(ulong_t i)
highbit64(uint64_t i)
{
register int h = 1;

if (i == 0)
return (0);
#ifdef _LP64
if (i & 0xffffffff00000000ul) {
if (i & 0xffffffff00000000ULL) {
h += 32; i >>= 32;
}
#endif
if (i & 0xffff0000) {
h += 16; i >>= 16;
}
Expand Down
8 changes: 6 additions & 2 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/

Expand All @@ -40,6 +40,7 @@
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/range_tree.h>

struct dbuf_hold_impl_data {
/* Function arguments */
Expand Down Expand Up @@ -1234,7 +1235,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
if (dn->dn_free_ranges[txgoff] != NULL) {
range_tree_clear(dn->dn_free_ranges[txgoff],
db->db_blkid, 1);
}
mutex_exit(&dn->dn_mtx);
db->db_freed_in_flight = FALSE;
}
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand Down Expand Up @@ -423,7 +423,7 @@ ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)

ddt_stat_generate(ddt, dde, &dds);

bucket = highbit(dds.dds_ref_blocks) - 1;
bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT(bucket >= 0);

ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
Expand Down
125 changes: 20 additions & 105 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand All @@ -35,8 +35,7 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>

static int free_range_compar(const void *node1, const void *node2);
#include <sys/range_tree.h>

static kmem_cache_t *dnode_cache;
/*
Expand Down Expand Up @@ -92,9 +91,7 @@ dnode_cons(void *arg, void *unused, int kmflag)

for (i = 0; i < TXG_SIZE; i++) {
list_link_init(&dn->dn_dirty_link[i]);
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
Expand Down Expand Up @@ -142,7 +139,7 @@ dnode_dest(void *arg, void *unused)

for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
avl_destroy(&dn->dn_ranges[i]);
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
Expand Down Expand Up @@ -313,19 +310,6 @@ dnode_buf_byteswap(void *vbuf, size_t size)
}
}

/*
 * Three-way comparison of two free_range_t nodes, keyed solely on the
 * starting block id (fr_blkid).  Returns -1 if node1 starts before node2,
 * 1 if it starts after, and 0 if both start at the same block.  The
 * length of the ranges is not examined.
 */
static int
free_range_compar(const void *node1, const void *node2)
{
const free_range_t *rp1 = node1;
const free_range_t *rp2 = node2;

if (rp1->fr_blkid < rp2->fr_blkid)
return (-1);
else if (rp1->fr_blkid > rp2->fr_blkid)
return (1);
else return (0);
}

void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -374,7 +358,7 @@ dnode_setdblksz(dnode_t *dn, int size)
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
dn->dn_datablksz = size;
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}

static dnode_t *
Expand Down Expand Up @@ -530,7 +514,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_next_blksz[i]);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}

dn->dn_type = ot;
Expand Down Expand Up @@ -695,7 +679,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
Expand Down Expand Up @@ -758,8 +743,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
odn->dn_ranges[i].avl_root = NULL;
odn->dn_ranges[i].avl_numnodes = 0;
odn->dn_free_ranges[i] = NULL;
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
Expand Down Expand Up @@ -1462,59 +1446,6 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
rw_downgrade(&dn->dn_struct_rwlock);
}

/*
 * Remove the blocks [blkid, blkid + nblks) from the free ranges recorded
 * in dn->dn_ranges[] for the txg slot of tx (tx_txg & TXG_MASK).
 * Each overlapping entry is deleted outright, trimmed at its beginning or
 * end, or split into two entries, depending on how it overlaps the
 * cleared span.  Entries that do not overlap are left untouched.
 *
 * The caller must hold dn->dn_mtx (asserted below).
 */
void
dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
avl_index_t where;
free_range_t *rp;
free_range_t rp_tofind;
uint64_t endblk = blkid + nblks;

ASSERT(MUTEX_HELD(&dn->dn_mtx));
ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */

dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
blkid, nblks, tx->tx_txg);
/*
 * Choose where to start walking: the entry keyed exactly at blkid if
 * one exists, otherwise its neighbor before the insertion point, and
 * failing that the neighbor after it.
 */
rp_tofind.fr_blkid = blkid;
rp = avl_find(tree, &rp_tofind, &where);
if (rp == NULL)
rp = avl_nearest(tree, where, AVL_BEFORE);
if (rp == NULL)
rp = avl_nearest(tree, where, AVL_AFTER);

/* Walk forward until an entry starts past the end of the cleared span. */
while (rp && (rp->fr_blkid <= blkid + nblks)) {
uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
/* Fetch the successor now; rp may be removed and freed below. */
free_range_t *nrp = AVL_NEXT(tree, rp);

if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
/* clear this entire range */
avl_remove(tree, rp);
kmem_free(rp, sizeof (free_range_t));
} else if (blkid <= rp->fr_blkid &&
endblk > rp->fr_blkid && endblk < fr_endblk) {
/* clear the beginning of this range */
rp->fr_blkid = endblk;
rp->fr_nblks = fr_endblk - endblk;
} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
endblk >= fr_endblk) {
/* clear the end of this range */
rp->fr_nblks = blkid - rp->fr_blkid;
} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
/* clear a chunk out of this range */
free_range_t *new_rp =
kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);

/* new_rp keeps the tail past endblk; rp keeps the head before blkid. */
new_rp->fr_blkid = endblk;
new_rp->fr_nblks = fr_endblk - endblk;
avl_insert_here(tree, new_rp, rp, AVL_AFTER);
rp->fr_nblks = blkid - rp->fr_blkid;
}
/* there may be no overlap */
rp = nrp;
}
}

void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -1665,22 +1596,17 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
* We will finish up this free operation in the syncing phase.
*/
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, blkid, nblks, tx);
{
free_range_t *rp, *found;
avl_index_t where;
avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];

/* Add new range to dn_ranges */
rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
rp->fr_blkid = blkid;
rp->fr_nblks = nblks;
found = avl_find(tree, rp, &where);
ASSERT(found == NULL);
avl_insert(tree, rp, where);
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
blkid, nblks, tx->tx_txg);
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
dn->dn_free_ranges[txgoff] =
range_tree_create(NULL, NULL, &dn->dn_mtx);
}
range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
}
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
blkid, nblks, tx->tx_txg);
mutex_exit(&dn->dn_mtx);

dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
Expand Down Expand Up @@ -1708,7 +1634,6 @@ dnode_spill_freed(dnode_t *dn)
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
free_range_t range_tofind;
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;

Expand All @@ -1728,20 +1653,10 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));

range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
free_range_t *range_found;
avl_index_t idx;

range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
if (range_found) {
ASSERT(range_found->fr_nblks > 0);
break;
}
range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
if (range_found &&
range_found->fr_blkid + range_found->fr_nblks > blkid)
if (dn->dn_free_ranges[i] != NULL &&
range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
break;
}
mutex_exit(&dn->dn_mtx);
Expand Down
Loading

0 comments on commit 9bd274d

Please sign in to comment.