From d891e166580dd22c7ea0683b6a2431f81fe20a8f Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Thu, 17 Dec 2015 17:46:29 -0500 Subject: [PATCH] Set proper birth epochs for Ln (n > 0) holes --- module/zfs/dbuf.c | 82 ++++++++++++++++++++++++++++++++++++++--- module/zfs/dnode_sync.c | 10 ++--- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 31242d60154e..d0d288303451 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -541,13 +541,35 @@ dbuf_verify(dmu_buf_impl_t *db) * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. + * + * One exception is Ln indirect block representing + * a hole with non-zero birth epoch; such blocks can also + * be filled with Ln-1 hole block pointers */ if (db->db_dirtycnt == 0) { - ASSERTV(uint64_t *buf = db->db.db_data); - int i; + if (db->db_level == 0) { + uint64_t *buf = db->db.db_data; + int i; - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } else { + blkptr_t zero_bp, *bp; + + ASSERT3U(1ULL << DB_DNODE(db)->dn_indblkshift, + ==, db->db.db_size); + BP_ZERO(&zero_bp); + for (bp = (blkptr_t *)db->db.db_data; + bp < (blkptr_t *)((char *)db->db.db_data + + db->db.db_size); + bp++) { + ASSERT(BP_EQUAL(bp, &zero_bp) || + (BP_IS_HOLE(bp) && + (BP_GET_TYPE(bp) == dn->dn_type) && + (BP_GET_LEVEL(bp) == + (db->db_level - 1)))); + } } } } @@ -691,11 +713,44 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dmu_object_type_t dtype = dn->dn_type; + uint32_t bshift = (db->db_level > 1) ? 
+ dn->dn_indblkshift : dn->dn_datablkshift; DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); + + if (db->db_level > 0 && db->db_blkptr && + BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth > 0) { + blkptr_t *bp; + /* + * If we are reading an Ln hole using a block pointer + * with the birth epoch that is not zero, this means + * that the hole_birth was active when the block pointer + * was written, and we cannot simply zero the buffer + * out: if the block is subsequently partially + * overwritten, then the remaining Ln-1 holes will have + * their birth epochs equal to zero, which can lead to + * incorrectly formed incremental send stream. + * Instead, we pre-fill the initial contents of the Ln + * block with block pointers indicating Ln-1 holes with + * the same type and birth epoch as the Ln hole; the + * logical size is set based on the hole level and the + * dnode data or indirect block shift + */ + for (bp = (blkptr_t *)db->db.db_data; + bp < (blkptr_t *)((char *)db->db.db_data + + db->db.db_size); + bp++) { + BP_SET_TYPE(bp, dtype); + BP_SET_LEVEL(bp, db->db_level-1); + BP_SET_LSIZE(bp, 1ULL << bshift); + BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + } + } + db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); @@ -1274,7 +1329,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + dprintf_dbuf(db, "thread %p size=%llx txg=%llx db=%p\n", + curthread, (u_longlong_t)db->db.db_size, tx->tx_txg, db); if (db->db_blkid != DMU_BONUS_BLKID) { /* @@ -1484,7 +1540,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + dprintf_dbuf(db, "thread %p size=%llx txg=%llx db=%p\n", + curthread, 
(u_longlong_t)db->db.db_size, tx->tx_txg, db); ASSERT(db->db.db_size != 0); @@ -2831,12 +2888,25 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) } } else { blkptr_t *ibp = db->db.db_data; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } + /* + * it turns out that pre-filling Ln holes in dbuf_read_impl() + * can result in metadata block ids pointing beyond the + * dn_maxblkid; this seems like a good place to fix dn_maxblkid + * up in a manner similar to one above for level-0 blocks + */ + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > + dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) + dn->dn_phys->dn_maxblkid = ((db->db_blkid + 1) << + (db->db_level * epbs)) - 1; + mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index df5c8e4ee6c4..31d5e4dbb16c 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -62,7 +62,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* check for existing blkptrs in the dnode */ for (i = 0; i < nblkptr; i++) - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]) || + dn->dn_phys->dn_blkptr[i].blk_birth != 0) break; if (i != nblkptr) { /* transfer dnode's block pointers to new indirect block */ @@ -710,10 +711,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) int i; ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); /* the blkptrs we are losing better be unallocated */ - for (i = 0; i < dnp->dn_nblkptr; i++) { - if (i >= dn->dn_next_nblkptr[txgoff]) - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); - } + for (i = dn->dn_next_nblkptr[txgoff]; + i < dnp->dn_nblkptr; i++) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); #endif } mutex_enter(&dn->dn_mtx);