Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenZFS 9438 - Holes can lose birth time info if a block has a mix of birth times #7746

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions module/zfs/dmu_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);

ASSERT(dn->dn_type != DMU_OT_NONE);
/*
* If we don't create this free range, we'll leak indirect blocks when
* we get to freeing the dnode in syncing context.
*/
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
Expand Down
74 changes: 70 additions & 4 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1889,6 +1889,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
}
}

/*
* Dirty all the in-core level-1 dbufs in the range specified by start_blkid
* and end_blkid.
*/
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
dmu_buf_impl_t db_search;
dmu_buf_impl_t *db;
avl_index_t where;

mutex_enter(&dn->dn_dbufs_mtx);

db_search.db_level = 1;
db_search.db_blkid = start_blkid + 1;
db_search.db_state = DB_SEARCH;
for (;;) {

db = avl_find(&dn->dn_dbufs, &db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

if (db == NULL || db->db_level != 1 ||
db->db_blkid >= end_blkid) {
break;
}

/*
* Setup the next blkid we want to search for.
*/
db_search.db_blkid = db->db_blkid + 1;
ASSERT3U(db->db_blkid, >=, start_blkid);

/*
* If the dbuf transitions to DB_EVICTING while we're trying
* to dirty it, then we will be unable to discover it in
* the dbuf hash table. This will result in a call to
* dbuf_create() which needs to acquire the dn_dbufs_mtx
* lock. To avoid a deadlock, we drop the lock before
* dirtying the level-1 dbuf.
*/
mutex_exit(&dn->dn_dbufs_mtx);
dnode_dirty_l1(dn, db->db_blkid, tx);
mutex_enter(&dn->dn_dbufs_mtx);
}

#ifdef ZFS_DEBUG
/*
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
*/
db_search.db_level = 1;
db_search.db_blkid = start_blkid + 1;
db_search.db_state = DB_SEARCH;
db = avl_find(&dn->dn_dbufs, &db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_level != 1 || db->db_blkid >= end_blkid)
break;
ASSERT(db->db_dirtycnt > 0);
}
#endif
mutex_exit(&dn->dn_dbufs_mtx);
}

void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -1921,13 +1987,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (off == 0 && len >= blksz) {
/*
* Freeing the whole block; fast-track this request.
* Note that we won't dirty any indirect blocks,
* which is fine because we will be freeing the entire
* file and thus all indirect blocks will be freed
* by free_children().
*/
blkid = 0;
nblks = 1;
if (dn->dn_nlevels > 1)
dnode_dirty_l1(dn, 0, tx);
goto done;
} else if (off >= blksz) {
/* Freeing past end-of-data */
Expand Down Expand Up @@ -2040,6 +2104,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (last != first)
dnode_dirty_l1(dn, last, tx);

dnode_dirty_l1range(dn, first, last, tx);

int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
Expand Down
72 changes: 48 additions & 24 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif

/*
* We don't usually free the indirect blocks here. If in one txg we have a
* free_range and a write to the same indirect block, it's important that we
* preserve the hole's birth times. Therefore, we don't free any any indirect
* blocks in free_children(). If an indirect block happens to turn into all
* holes, it will be freed by dbuf_write_children_ready, which happens at a
* point in the syncing process where we know for certain the contents of the
* indirect block.
*
* However, if we're freeing a dnode, its space accounting must go to zero
* before we actually try to free the dnode, or we will trip an assertion. In
* addition, we know the case described above cannot occur, because the dnode is
* being freed. Therefore, we free the indirect blocks immediately in that
* case.
*/
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
boolean_t free_indirects, dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
Expand All @@ -249,6 +264,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

/*
* If we modify this indirect block, and we are not freeing the
* dnode (!free_indirects), then this indirect block needs to get
* written to disk by dbuf_write(). If it is dirty, we know it will
* be written (otherwise, we would have incorrect on-disk state
* because the space would be freed but still referenced by the BP
* in this indirect block). Therefore we VERIFY that it is
* dirty.
*
* Our VERIFY covers some cases that do not actually have to be
* dirty, but the open-context code happens to dirty. E.g. if the
* blocks we are freeing are all holes, because in that case, we
* are only freeing part of this indirect block, so it is an
* ancestor of the first or last block to be freed. The first and
* last L1 indirect blocks are always dirtied by dnode_free_range().
*/
VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);

dbuf_release_bp(db);
bp = db->db.db_data;

Expand Down Expand Up @@ -284,32 +317,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);

free_children(subdb, blkid, nblks, tx);
free_children(subdb, blkid, nblks, free_indirects, tx);
dbuf_rele(subdb, FTAG);
}
}

/* If this whole block is free, free ourself too. */
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
break;
}
if (i == 1 << epbs) {
/*
* We only found holes. Grab the rwlock to prevent
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (free_indirects) {
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
rw_exit(&dn->dn_struct_rwlock);
free_blocks(dn, db->db_blkptr, 1, tx);
} else {
/*
* Partial block free; must be marked dirty so that it
* will be written out.
*/
ASSERT(db->db_dirtycnt > 0);
}

DB_DNODE_EXIT(db);
Expand All @@ -322,7 +339,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
*/
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
boolean_t free_indirects, dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
int dnlevel = dn->dn_phys->dn_nlevels;
Expand Down Expand Up @@ -362,7 +379,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);

free_children(db, blkid, nblks, tx);
free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
}
Expand All @@ -387,6 +404,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
typedef struct dnode_sync_free_range_arg {
dnode_t *dsfra_dnode;
dmu_tx_t *dsfra_tx;
boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

static void
Expand All @@ -396,7 +414,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
dnode_t *dn = dsfra->dsfra_dnode;

mutex_exit(&dn->dn_mtx);
dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
dnode_sync_free_range_impl(dn, blkid, nblks,
dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
mutex_enter(&dn->dn_mtx);
}

Expand Down Expand Up @@ -712,6 +731,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
dsfra.dsfra_free_indirects = freeing_dnode;
if (freeing_dnode) {
ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
0, dn->dn_maxblkid + 1));
}
mutex_enter(&dn->dn_mtx);
range_tree_vacate(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy',
'send_freeobjects', 'send_realloc_dnode_size']
'send_freeobjects', 'send_realloc_dnode_size', 'send_hole_birth']
tags = ['functional', 'rsend']

[tests/functional/scrub_mirror]
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/rsend/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ dist_pkgdata_SCRIPTS = \
send-c_zstreamdump.ksh \
send-cpL_varied_recsize.ksh \
send_freeobjects.ksh \
send_realloc_dnode_size.ksh
send_realloc_dnode_size.ksh \
send_hole_birth.ksh

dist_pkgdata_DATA = \
rsend.cfg \
Expand Down
47 changes: 42 additions & 5 deletions tests/zfs-tests/tests/functional/rsend/rsend.kshlib
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,20 @@ function cleanup_pools
destroy_pool $POOL3
}

function cmp_md5s {
typeset file1=$1
typeset file2=$2

eval md5sum $file1 | awk '{ print $1 }' > $BACKDIR/md5_file1
eval md5sum $file2 | awk '{ print $1 }' > $BACKDIR/md5_file2
diff $BACKDIR/md5_file1 $BACKDIR/md5_file2
typeset -i ret=$?

rm -f $BACKDIR/md5_file1 $BACKDIR/md5_file2

return $ret
}

#
# Detect if the given two filesystems have same sub-datasets
#
Expand Down Expand Up @@ -388,13 +402,36 @@ function mk_files
maxsize=$2
file_id_offset=$3
fs=$4
bs=512

for ((i=0; i<$nfiles; i=i+1)); do
dd if=/dev/urandom \
of=/$fs/file-$maxsize-$((i+$file_id_offset)) \
bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \
count=1 >/dev/null 2>&1 || log_fail \
"Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))"
file_name="/$fs/file-$maxsize-$((i+$file_id_offset))"
file_size=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1))

#
# Create an interesting mix of files which contain both
# data blocks and holes for more realistic test coverage.
# Half the files are created as sparse then partially filled,
# the other half is dense then a hole is punched in the file.
#
if [ $((RANDOM % 2)) -eq 0 ]; then
truncate -s $file_size $file_name || \
log_fail "Failed to create $file_name"
dd if=/dev/urandom of=$file_name \
bs=$bs count=$(($file_size / 2 / $bs)) \
seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \
conv=notrunc >/dev/null 2>&1 || \
log_fail "Failed to create $file_name"
else
dd if=/dev/urandom of=$file_name \
bs=$file_size count=1 >/dev/null 2>&1 || \
log_fail "Failed to create $file_name"
dd if=/dev/zero of=$file_name \
bs=$bs count=$(($file_size / 2 / $bs)) \
seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \
conv=notrunc >/dev/null 2>&1 || \
log_fail "Failed to create $file_name"
fi
done
echo Created $nfiles files of random sizes up to $maxsize bytes
}
Expand Down
Loading