Skip to content

Commit

Permalink
9438 Holes can lose birth time info if a block has a mix of birth times
Browse files Browse the repository at this point in the history
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>

As reported by openzfs/zfs#4996, there is
yet another hole birth issue. In this one, if a block is entirely holes,
but the birth times are not all the same, we lose that information by
creating one hole with the current txg as its birth time.

The ZoL PR's fix approach is incorrect. Ultimately, the problem here is
that when you truncate and write a file in the same transaction group,
the dbuf for the indirect block will be zeroed out to deal with the
truncation, and then written for the write. During this process, we will
lose hole birth time information for any holes in the range. In the case
where a dnode is being freed, we need to determine whether the block
should be converted to a higher-level hole in the zio pipeline, and if
so do it when the dnode is being synced out.

Closes openzfs#611
  • Loading branch information
pcd1193182 authored and Prakash Surya committed Jun 18, 2018
1 parent 9196d4c commit 12ac60a
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 25 deletions.
4 changes: 4 additions & 0 deletions usr/src/uts/common/fs/zfs/dmu_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);

ASSERT(dn->dn_type != DMU_OT_NONE);
/*
* If we don't create this free range, we'll leak indirect blocks when
* we get to freeing the dnode in syncing context.
*/
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
Expand Down
68 changes: 68 additions & 0 deletions usr/src/uts/common/fs/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
}
}

/*
* Dirty all the in-core level-1 dbufs in the range specified by start_blkid
* and end_blkid.
*/
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
dmu_buf_impl_t db_search;
dmu_buf_impl_t *db;
avl_index_t where;

mutex_enter(&dn->dn_dbufs_mtx);

db_search.db_level = 1;
db_search.db_blkid = start_blkid + 1;
db_search.db_state = DB_SEARCH;
for (;;) {

db = avl_find(&dn->dn_dbufs, &db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

if (db == NULL || db->db_level != 1 ||
db->db_blkid >= end_blkid) {
break;
}

/*
* Setup the next blkid we want to search for.
*/
db_search.db_blkid = db->db_blkid + 1;
ASSERT3U(db->db_blkid, >=, start_blkid);

/*
* If the dbuf transitions to DB_EVICTING while we're trying
* to dirty it, then we will be unable to discover it in
* the dbuf hash table. This will result in a call to
* dbuf_create() which needs to acquire the dn_dbufs_mtx
* lock. To avoid a deadlock, we drop the lock before
* dirtying the level-1 dbuf.
*/
mutex_exit(&dn->dn_dbufs_mtx);
dnode_dirty_l1(dn, db->db_blkid, tx);
mutex_enter(&dn->dn_dbufs_mtx);
}

#ifdef ZFS_DEBUG
/*
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
*/
db_search.db_level = 1;
db_search.db_blkid = start_blkid + 1;
db_search.db_state = DB_SEARCH;
db = avl_find(&dn->dn_dbufs, &db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_level != 1 || db->db_blkid >= end_blkid)
break;
ASSERT(db->db_dirtycnt > 0);
}
#endif
mutex_exit(&dn->dn_dbufs_mtx);
}

void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -1658,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (last != first)
dnode_dirty_l1(dn, last, tx);

dnode_dirty_l1range(dn, first, last, tx);

int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
Expand Down
54 changes: 30 additions & 24 deletions usr/src/uts/common/fs/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif

/*
* We don't usually free the indirect blocks here. If in one txg we have a
* free_range and a write to the same indirect block, it's important that we
* preserve the hole's birth times. Therefore, we don't free any any indirect
* blocks in free_children(). If an indirect block happens to turn into all
* holes, it will be freed by dbuf_write_children_ready, which happens at a
* point in the syncing process where we know for certain the contents of the
* indirect block.
*
* However, if we're freeing a dnode, its space accounting must go to zero
* before we actually try to free the dnode, or we will trip an assertion. In
* addition, we know the case described above cannot occur, because the dnode is
* being freed. Therefore, we free the indirect blocks immediately in that
* case.
*/
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
boolean_t free_indirects, dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
Expand Down Expand Up @@ -283,32 +298,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);

free_children(subdb, blkid, nblks, tx);
free_children(subdb, blkid, nblks, free_indirects, tx);
dbuf_rele(subdb, FTAG);
}
}

/* If this whole block is free, free ourself too. */
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
break;
}
if (i == 1 << epbs) {
/*
* We only found holes. Grab the rwlock to prevent
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (free_indirects) {
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
rw_exit(&dn->dn_struct_rwlock);
free_blocks(dn, db->db_blkptr, 1, tx);
} else {
/*
* Partial block free; must be marked dirty so that it
* will be written out.
*/
ASSERT(db->db_dirtycnt > 0);
}

DB_DNODE_EXIT(db);
Expand All @@ -321,7 +320,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
*/
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
boolean_t free_indirects, dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
int dnlevel = dn->dn_phys->dn_nlevels;
Expand Down Expand Up @@ -361,7 +360,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);

free_children(db, blkid, nblks, tx);
free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
}
Expand All @@ -380,6 +379,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
typedef struct dnode_sync_free_range_arg {
dnode_t *dsfra_dnode;
dmu_tx_t *dsfra_tx;
boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

static void
Expand All @@ -389,7 +389,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
dnode_t *dn = dsfra->dsfra_dnode;

mutex_exit(&dn->dn_mtx);
dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
dnode_sync_free_range_impl(dn, blkid, nblks,
dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
mutex_enter(&dn->dn_mtx);
}

Expand Down Expand Up @@ -670,6 +671,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
dsfra.dsfra_free_indirects = freeing_dnode;
if (freeing_dnode) {
ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
0, dn->dn_maxblkid + 1));
}
mutex_enter(&dn->dn_mtx);
range_tree_vacate(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
Expand Down
3 changes: 2 additions & 1 deletion usr/src/uts/common/fs/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1603,7 +1603,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
return (0);
}

error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
DMU_OBJECT_END);
if (error) {
zfs_range_unlock(rl);
return (error);
Expand Down

0 comments on commit 12ac60a

Please sign in to comment.