From 43c9b567c7a8e0873ad52ff0b20c781a807ca63d Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 20 Sep 2016 10:02:29 -0700 Subject: [PATCH 1/2] OpenZFS 9438 - Holes can lose birth time info if a block has a mix of birth times As reported by https://github.com/zfsonlinux/zfs/issues/4996, there is yet another hole birth issue. In this one, if a block is entirely holes, but the birth times are not all the same, we lose that information by creating one hole with the current txg as its birth time. The ZoL PR's fix approach is incorrect. Ultimately, the problem here is that when you truncate and write a file in the same transaction group, the dbuf for the indirect block will be zeroed out to deal with the truncation, and then written for the write. During this process, we will lose hole birth time information for any holes in the range. In the case where a dnode is being freed, we need to determine whether the block should be converted to a higher-level hole in the zio pipeline, and if so do it when the dnode is being synced out. Porting Notes: * The DMU_OBJECT_END change in zfs_znode.c was already applied. * Added test cases from #5675 provided by @rincebrain for hole_birth issues. These test cases should be pushed upstream to OpenZFS. * Updated mk_files which is used by several rsend tests so the files created are a little more interesting and may contain holes. Authored by: Paul Dagnelie Reviewed by: Matt Ahrens Reviewed by: George Wilson Approved by: Robert Mustacchi Ported-by: Brian Behlendorf OpenZFS-issue: https://www.illumos.org/issues/9438 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/738e2a3c External-issue: DLPX-46861 --- module/zfs/dmu_object.c | 4 + module/zfs/dnode.c | 68 ++++++++++ module/zfs/dnode_sync.c | 54 ++++---- tests/runfiles/linux.run | 2 +- .../tests/functional/rsend/Makefile.am | 3 +- .../tests/functional/rsend/rsend.kshlib | 47 ++++++- .../functional/rsend/send_hole_birth.ksh | 123 ++++++++++++++++++ 7 files changed, 270 insertions(+), 31 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index b9960782efd1..afee869b6d76 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -311,6 +311,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7939a7ba6962..0be72e90a6bf 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1889,6 +1889,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) } } +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -2040,6 +2106,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (last != first) dnode_dirty_l1(dn, last, tx); + dnode_dirty_l1range(dn, first, last, tx); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 22b401ab5b98..3202faf49dac 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -230,9 +230,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +/* + * We don't usually free the indirect blocks here. If in one txg we have a + * free_range and a write to the same indirect block, it's important that we + * preserve the hole's birth times. Therefore, we don't free any any indirect + * blocks in free_children(). If an indirect block happens to turn into all + * holes, it will be freed by dbuf_write_children_ready, which happens at a + * point in the syncing process where we know for certain the contents of the + * indirect block. + * + * However, if we're freeing a dnode, its space accounting must go to zero + * before we actually try to free the dnode, or we will trip an assertion. In + * addition, we know the case described above cannot occur, because the dnode is + * being freed. Therefore, we free the indirect blocks immediately in that + * case. + */ static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; @@ -284,32 +299,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); - free_children(subdb, blkid, nblks, tx); + free_children(subdb, blkid, nblks, free_indirects, tx); dbuf_rele(subdb, FTAG); } } - /* If this whole block is free, free ourself too. */ - for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - if (i == 1 << epbs) { - /* - * We only found holes. Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (free_indirects) { + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) + ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); free_blocks(dn, db->db_blkptr, 1, tx); - } else { - /* - * Partial block free; must be marked dirty so that it - * will be written out. - */ - ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); @@ -322,7 +321,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; @@ -362,7 +361,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, tx); + free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } @@ -387,6 +386,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void @@ -396,7 +396,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } @@ -712,6 +713,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 89563189fd01..c3e3887a8820 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -743,7 +743,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy', - 'send_freeobjects', 'send_realloc_dnode_size'] + 'send_freeobjects', 'send_realloc_dnode_size', 'send_hole_birth'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am index fd00d6c49501..79e01d190fa6 100644 --- a/tests/zfs-tests/tests/functional/rsend/Makefile.am +++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am @@ -39,7 +39,8 @@ dist_pkgdata_SCRIPTS = \ send-c_zstreamdump.ksh \ send-cpL_varied_recsize.ksh \ send_freeobjects.ksh \ - send_realloc_dnode_size.ksh + send_realloc_dnode_size.ksh \ + send_hole_birth.ksh dist_pkgdata_DATA = \ rsend.cfg \ diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 909cd16776c4..72d2eb93d442 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -153,6 +153,20 @@ function cleanup_pools destroy_pool $POOL3 } +function cmp_md5s { + typeset file1=$1 + typeset file2=$2 + + eval md5sum $file1 | awk '{ print $1 }' > $BACKDIR/md5_file1 + eval md5sum $file2 | awk '{ print $1 }' > $BACKDIR/md5_file2 + diff $BACKDIR/md5_file1 $BACKDIR/md5_file2 + typeset -i ret=$? + + rm -f $BACKDIR/md5_file1 $BACKDIR/md5_file2 + + return $ret +} + # # Detect if the given two filesystems have same sub-datasets # @@ -388,13 +402,36 @@ function mk_files maxsize=$2 file_id_offset=$3 fs=$4 + bs=512 for ((i=0; i<$nfiles; i=i+1)); do - dd if=/dev/urandom \ - of=/$fs/file-$maxsize-$((i+$file_id_offset)) \ - bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \ - count=1 >/dev/null 2>&1 || log_fail \ - "Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))" + file_name="/$fs/file-$maxsize-$((i+$file_id_offset))" + file_size=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) + + # + # Create an interesting mix of files which contain both + # data blocks and holes for more realistic test coverage. + # Half the files are created as sparse then partially filled, + # the other half is dense then a hole is punched in the file. + # + if [ $((RANDOM % 2)) -eq 0 ]; then + truncate -s $file_size $file_name || \ + log_fail "Failed to create $file_name" + dd if=/dev/urandom of=$file_name \ + bs=$bs count=$(($file_size / 2 / $bs)) \ + seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \ + conv=notrunc >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + else + dd if=/dev/urandom of=$file_name \ + bs=$file_size count=1 >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + dd if=/dev/zero of=$file_name \ + bs=$bs count=$(($file_size / 2 / $bs)) \ + seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \ + conv=notrunc >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + fi done echo Created $nfiles files of random sizes up to $maxsize bytes } diff --git a/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh new file mode 100755 index 000000000000..c2b5ff7a0522 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh @@ -0,0 +1,123 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# DESCRIPTION: +# Verify send streams which contain holes. +# +# STRATEGY: +# 1. Create an initial file for the full send and snapshot. +# 2. Permute the file with holes and snapshot. +# 3. Send the full and incremental snapshots to a new pool. +# 4. Verify the contents of the files match. +# + +sendpool=$POOL +sendfs=$sendpool/sendfs + +recvpool=$POOL2 +recvfs=$recvpool/recvfs + +verify_runnable "both" + +log_assert "Test hole_birth" +log_onexit cleanup + +function cleanup +{ + cleanup_pool $sendpool + cleanup_pool $recvpool + set_tunable64 send_holes_without_birth_time 1 +} + +function send_and_verify +{ + log_must eval "zfs send $sendfs@snap1 > $BACKDIR/pool-snap1" + log_must eval "zfs receive -F $recvfs < $BACKDIR/pool-snap1" + + log_must eval "zfs send -i $sendfs@snap1 $sendfs@snap2 " \ + ">$BACKDIR/pool-snap1-snap2" + log_must eval "zfs receive $recvfs < $BACKDIR/pool-snap1-snap2" + + log_must cmp_md5s /$sendfs/file1 /$recvfs/file1 +} + +# By default sending hole_birth times is disabled. This functionality needs +# to be re-enabled for this test case to verify correctness. Once we're +# comfortable that all hole_birth bugs has been resolved this behavior may +# be re-enabled by default. +log_must set_tunable64 send_holes_without_birth_time 0 + +# Incremental send truncating the file and adding new data. +log_must zfs create -o recordsize=4k $sendfs + +log_must truncate -s 1G /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=11264 seek=1152 +log_must zfs snapshot $sendfs@snap1 + +log_must truncate -s 4194304 /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=152 seek=384 \ + conv=notrunc +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=10 seek=1408 \ + conv=notrunc +log_must zfs snapshot $sendfs@snap2 + +send_and_verify +log_must cleanup_pool $sendpool +log_must cleanup_pool $recvpool + +# Incremental send appending a hole and data. +log_must zfs create -o recordsize=512 $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=1 +log_must zfs snapshot $sendfs@snap1 + +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=3 +log_must zfs snapshot $sendfs@snap2 + +send_and_verify +log_must cleanup_pool $sendpool +log_must cleanup_pool $recvpool + +# Incremental send truncating the file and adding new data. +log_must zfs create -o recordsize=512 $sendfs + +log_must truncate -s 300M /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=128k conv=notrunc +log_must zfs snapshot $sendfs@snap1 + +log_must truncate -s 10M /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=1 seek=96k \ + conv=notrunc +log_must zfs snapshot $sendfs@snap2 + +send_and_verify + +log_pass "Test hole_birth" From 0f0a630dac7286cdb2814397e768cb966f1493f1 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 31 Oct 2016 10:42:37 -0700 Subject: [PATCH 2/2] OpenZFS 9439 - ZFS double-free due to failure to dirty indirect block Follow up commit for OpenZFS 9438. See the OpenZFS-issue link below for a complete analysis. Authored by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Paul Dagnelie Approved by: Robert Mustacchi Ported-by: Brian Behlendorf OpenZFS-issue: https://illumos.org/issues/9439 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/779220d External-issue: DLPX-46861 --- module/zfs/dnode.c | 6 ++---- module/zfs/dnode_sync.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 0be72e90a6bf..26471fda0b2a 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1987,13 +1987,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. - * Note that we won't dirty any indirect blocks, - * which is fine because we will be freeing the entire - * file and thus all indirect blocks will be freed - * by free_children(). */ blkid = 0; nblks = 1; + if (dn->dn_nlevels > 1) + dnode_dirty_l1(dn, 0, tx); goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 3202faf49dac..b1f734a8210e 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -264,6 +264,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + /* + * If we modify this indirect block, and we are not freeing the + * dnode (!free_indirects), then this indirect block needs to get + * written to disk by dbuf_write(). If it is dirty, we know it will + * be written (otherwise, we would have incorrect on-disk state + * because the space would be freed but still referenced by the BP + * in this indirect block). Therefore we VERIFY that it is + * dirty. + * + * Our VERIFY covers some cases that do not actually have to be + * dirty, but the open-context code happens to dirty. E.g. if the + * blocks we are freeing are all holes, because in that case, we + * are only freeing part of this indirect block, so it is an + * ancestor of the first or last block to be freed. The first and + * last L1 indirect blocks are always dirtied by dnode_free_range(). + */ + VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dbuf_release_bp(db); bp = db->db.db_data;