Skip to content

Commit

Permalink
Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
Browse files Browse the repository at this point in the history
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
  https://www.illumos.org/issues/4390
  illumos/illumos-gate@7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitb() caused a fair
number of un-mergable pieces of code.  This patch should reduce its
stack footprint a bit more.

The new local bptree_entry_phys_t in bptree_add() is dynamically-allocated
using kmem_zalloc() for the purpose of stack reduction.

The new global zfs_free_leak_on_eio has been defined as an integer
rather than a boolean_t as was the case with the related zfs_recover
global.  Also, zfs_free_leak_on_eio's definition has been inserted into
zfs_debug.c for consistency with the existing definition of zfs_recover.
Illumos placed it in spa_misc.c.

Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes openzfs#2545
  • Loading branch information
ahrens authored and behlendorf committed Aug 4, 2014
1 parent 9b67f60 commit fbeddd6
Show file tree
Hide file tree
Showing 17 changed files with 339 additions and 157 deletions.
3 changes: 2 additions & 1 deletion include/sys/bptree.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPTREE_H
Expand Down Expand Up @@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
boolean_t bptree_is_empty(objset_t *os, uint64_t obj);

void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
Expand Down
1 change: 0 additions & 1 deletion include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size);

#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
#define DMU_DEADLIST_OBJECT (-3ULL)

/*
* artificial blkids for bonus buffer and spill blocks
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_dir.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
#define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
#define LEAK_DIR_NAME "$LEAK"

#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir;
struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ typedef struct dsl_scan {
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;

/* for debugging / information */
uint64_t scn_visited_this_txg;
Expand Down
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ extern "C" {

extern int zfs_flags;
extern int zfs_recover;
extern int zfs_free_leak_on_eio;

#define ZFS_DEBUG_DPRINTF (1<<0)
#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
Expand Down
1 change: 1 addition & 0 deletions lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING:
case ZPOOL_PROP_LEAKED:
case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_ASHIFT:
if (literal)
Expand Down
37 changes: 37 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,43 @@ Set additional debugging flags
Default value: \fB1\fR.
.RE

.sp
.ne 2
.na
\fBzfs_free_leak_on_eio\fR (int)
.ad
.RS 12n
If destroy encounters an EIO while reading metadata (e.g. indirect
blocks), space referenced by the missing metadata can not be freed.
Normally this causes the background destroy to become "stalled", as
it is unable to make forward progress. While in this stalled state,
all remaining space to free from the error-encountering filesystem is
"temporarily leaked". Set this flag to cause it to ignore the EIO,
permanently leak the space from indirect blocks that can not be read,
and continue to free everything else that it can.

The default, "stalling" behavior is useful if the storage partially
fails (i.e. some but not all i/os fail), and then later recovers. In
this case, we will be able to continue pool operations while it is
partially failed, and when it recovers, we can continue to free the
space, with no leaks. However, note that this case is actually
fairly rare.

Typically pools either (a) fail completely (but perhaps temporarily,
e.g. a top-level vdev going offline), or (b) have localized,
permanent errors (e.g. disk returns the wrong data due to bit flip or
firmware bug). In case (a), this setting does not matter because the
pool will be suspended and the sync thread will not be able to make
forward progress regardless. In case (b), because the error is
permanent, the best we can do is leak the minimum amount of space,
which is what setting this flag will do. Therefore, it is reasonable
for this flag to normally be set, but we chose the more conservative
approach of not setting it, so that there is no possibility of
leaking space in the "partial temporary" failure case.
.sp
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
Expand Down
2 changes: 2 additions & 0 deletions module/zcommon/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
Expand Down
109 changes: 87 additions & 22 deletions module/zfs/bptree.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx));
}

boolean_t
bptree_is_empty(objset_t *os, uint64_t obj)
{
dmu_buf_t *db;
bptree_phys_t *bt;
boolean_t rv;

VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
rv = (bt->bt_begin == bt->bt_end);
dmu_buf_rele(db, FTAG);
return (rv);
}

void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
dmu_buf_t *db;
bptree_phys_t *bt;
bptree_entry_phys_t bte;
bptree_entry_phys_t *bte;

/*
* bptree objects are in the pool mos, therefore they can only be
Expand All @@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;

bte.be_birth_txg = birth_txg;
bte.be_bp = *bp;
bzero(&bte.be_zb, sizeof (bte.be_zb));
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
bte->be_birth_txg = birth_txg;
bte->be_bp = *bp;
dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
kmem_free(bte, sizeof (*bte));

dmu_buf_will_dirty(db, tx);
bt->bt_end++;
Expand Down Expand Up @@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err);
}

/*
* If "free" is set:
* - It is assumed that "func" will be freeing the block pointers.
* - If "func" returns nonzero, the bookmark will be remembered and
* iteration will be restarted from this point on next invocation.
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
* bptree_iterate will remember the bookmark, continue traversing
* any additional entries, and return 0.
*
* If "free" is not set, traversal will stop and return an error if
* an i/o error is encountered.
*
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
* traverse_dataset_destroyed()).
*/
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
Expand All @@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;

ASSERT(!free || i == ba.ba_phys->bt_begin);

err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;

if (zfs_recover)
if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD;
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
"bookmark %lld/%lld/%lld/%lld",
i, (longlong_t)bte.be_birth_txg,
(longlong_t)bte.be_zb.zb_objset,
(longlong_t)bte.be_zb.zb_object,
(longlong_t)bte.be_zb.zb_level,
(longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
if (err == ERESTART) {
/*
* The callback has freed the visited block pointers.
* Record our traversal progress on disk, either by
* updating this record's bookmark, or by logically
* removing this record by advancing bt_begin.
*/
if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
break;
}
if (err != 0) {
if (err == EIO || err == ECKSUM ||
err == ENXIO) {
/*
* Skip the rest of this tree and
* continue on to the next entry.
*/
err = 0;
ioerr = B_TRUE;
} else {
break;
}
} else if (ioerr) {
/*
* We can not properly handle an i/o
* error, because the traversal code
* does not know how to resume from an
* arbitrary bookmark.
* This entry is finished, but there were
* i/o errors on previous entries, so we
* can't adjust bt_begin. Set this entry's
* be_birth_txg such that it will be
* treated as a no-op in future traversals.
*/
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
bte.be_birth_txg = UINT64_MAX;
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
}

ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
if (!ioerr) {
ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
}
} else if (err != 0) {
break;
}
}

ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
ASSERT(!free || err != 0 || ioerr ||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
if (zfs_free_leak_on_eio) {
ba.ba_phys->bt_bytes = 0;
ba.ba_phys->bt_comp = 0;
ba.ba_phys->bt_uncomp = 0;
}

ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);
Expand Down
Loading

0 comments on commit fbeddd6

Please sign in to comment.