Skip to content

Commit

Permalink
WIP issue 2094.
Browse files Browse the repository at this point in the history
Commit 1421c89 expanded the size of a zbookmark_t from 24 to 25 64-bit
values which similarly expands the size of the "scan" entry in the pool's
object directory and causes the pool to become un-importable by other
ZFS implementations.

This patch is a work-in-progress second cut at a fix.  It does two
things: first, replaces the scn_bookmark member of dsl_scan_phys_t
with 4 members representing the members of a zbookmark_t.  Second, it
relaxes the size check during import to alow a broken pool to import by
allowing an EOVERFLOW that occurs when the "scan" entry is too long by
a single integer.
  • Loading branch information
dweeezil committed Feb 2, 2014
1 parent 8b46464 commit 10fe651
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 30 deletions.
22 changes: 21 additions & 1 deletion include/sys/dsl_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,13 @@ typedef struct dsl_scan_phys {
uint64_t scn_errors; /* scan I/O error count */
uint64_t scn_ddt_class_max;
ddt_bookmark_t scn_ddt_bookmark;
zbookmark_t scn_bookmark;

/* The next 4 members represent a bookmark_t */
uint64_t scn_zb_objset;
uint64_t scn_zb_object;
uint64_t scn_zb_level;
uint64_t scn_zb_blkid;

uint64_t scn_flags; /* dsl_scan_flags_t */
} dsl_scan_phys_t;

Expand All @@ -72,6 +78,20 @@ typedef enum dsl_scan_flags {
DSF_VISIT_DS_AGAIN = 1<<0,
} dsl_scan_flags_t;

#define SCAN_COMPARE_ZBOOKMARK(scan, zb) \
((scan)->scn_zb_objset == (zb)->zb_objset && \
(scan)->scn_zb_object == (zb)->zb_object && \
(scan)->scn_zb_level == (zb)->zb_level && \
(scan)->scn_zb_blkid == (zb)->zb_blkid)

#define SCAN_SET_BOOKMARK(scan, objset, object, level, blkid) \
{ \
(scan)->scn_zb_objset = objset; \
(scan)->scn_zb_object = object; \
(scan)->scn_zb_level = level; \
(scan)->scn_zb_blkid = blkid; \
}

/*
* Every pool will have one dsl_scan_t and this structure will contain
* in-memory information about the scan and a pointer to the on-disk
Expand Down
10 changes: 10 additions & 0 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,16 @@ struct zbookmark {
(zb)->zb_level == ZB_ROOT_LEVEL && \
(zb)->zb_blkid == ZB_ROOT_BLKID)

#define SCAN_ZB_IS_ZERO(scan) \
((scan)->scn_zb_objset == 0 && \
(scan)->scn_zb_object == 0 && \
(scan)->scn_zb_level == 0 && \
(scan)->scn_zb_blkid == 0)
#define SCAN_ZB_IS_ROOT(scan) \
((scan)->scn_zb_object == ZB_ROOT_OBJECT && \
(scan)->scn_zb_level == ZB_ROOT_LEVEL && \
(scan)->scn_zb_blkid == ZB_ROOT_BLKID)

typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
Expand Down
78 changes: 49 additions & 29 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
&scn->scn_phys);
if (err == ENOENT)
return (0);
else if (err)
if (err == EOVERFLOW) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1,
&scn->scn_phys);
}
if (err)
return (err);

if (scn->scn_phys.scn_state == DSS_SCANNING &&
Expand Down Expand Up @@ -401,7 +406,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */

if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
if (!SCAN_ZB_IS_ZERO(&scn->scn_phys))
return (B_FALSE); /* we're resuming */

/* We only know how to resume from level-0 blocks. */
Expand All @@ -421,7 +426,10 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
scn->scn_phys.scn_bookmark = *zb;
scn->scn_phys.scn_zb_objset = zb->zb_objset;
scn->scn_phys.scn_zb_object = zb->zb_object;
scn->scn_phys.scn_zb_level = zb->zb_level;
scn->scn_phys.scn_zb_blkid = zb->zb_blkid;
}
dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
Expand Down Expand Up @@ -552,13 +560,19 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
/*
* We never skip over user/group accounting objects (obj<0)
*/
if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
if (!SCAN_ZB_IS_ZERO(&scn->scn_phys) &&
(int64_t)zb->zb_object >= 0) {
zbookmark_t tmpbm;

/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
SET_BOOKMARK(&tmpbm, scn->scn_phys.scn_zb_objset,
scn->scn_phys.scn_zb_object,
scn->scn_phys.scn_zb_level,
scn->scn_phys.scn_zb_blkid);
if (zbookmark_is_before(dnp, zb, &tmpbm))
return (B_TRUE);

/*
Expand All @@ -567,14 +581,17 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
* indicate that it's OK to start checking for pausing
* again.
*/
if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
if (SCAN_COMPARE_ZBOOKMARK(&scn->scn_phys, zb) == 0 ||
zb->zb_object > scn->scn_phys.scn_zb_object) {
dprintf("resuming at %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
scn->scn_phys.scn_zb_objset = 0;
scn->scn_phys.scn_zb_object = 0;
scn->scn_phys.scn_zb_level = 0;
scn->scn_phys.scn_zb_blkid = 0;
}
}
return (B_FALSE);
Expand Down Expand Up @@ -812,18 +829,18 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;

if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
if (scn->scn_phys.scn_zb_objset == ds->ds_object) {
if (dsl_dataset_is_snapshot(ds)) {
/* Note, scn_cur_{min,max}_txg stays the same. */
scn->scn_phys.scn_bookmark.zb_objset =
scn->scn_phys.scn_zb_objset =
ds->ds_phys->ds_next_snap_obj;
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)ds->ds_phys->ds_next_snap_obj);
scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
} else {
SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
SCAN_SET_BOOKMARK(&scn->scn_phys,
ZB_DESTROYED_OBJSET, 0, 0, 0);
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset bookmark to -1,0,0,0",
Expand Down Expand Up @@ -875,8 +892,8 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)

ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset =
if (scn->scn_phys.scn_zb_objset == ds->ds_object) {
scn->scn_phys.scn_zb_objset =
ds->ds_phys->ds_prev_snap_obj;
zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
"reset zb_objset to %llu",
Expand Down Expand Up @@ -907,14 +924,14 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;

if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
if (scn->scn_phys.scn_zb_objset == ds1->ds_object) {
scn->scn_phys.scn_zb_objset = ds2->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds1->ds_object,
(u_longlong_t)ds2->ds_object);
} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
} else if (scn->scn_phys.scn_zb_objset == ds2->ds_object) {
scn->scn_phys.scn_zb_objset = ds1->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds2->ds_object,
Expand Down Expand Up @@ -1255,7 +1272,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
return;
}

if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
if (scn->scn_phys.scn_zb_objset == DMU_META_OBJSET) {
/* First do the MOS & ORIGIN */

scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
Expand All @@ -1274,15 +1291,15 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
dp->dp_origin_snap->ds_object, tx);
}
ASSERT(!scn->scn_pausing);
} else if (scn->scn_phys.scn_bookmark.zb_objset !=
} else if (scn->scn_phys.scn_zb_objset !=
ZB_DESTROYED_OBJSET) {
/*
* If we were paused, continue from here. Note if the
* ds we were paused on was deleted, the zb_objset may
* be -1, so we will skip this and find a new objset
* below.
*/
dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
dsl_scan_visitds(scn, scn->scn_phys.scn_zb_objset, tx);
if (scn->scn_pausing)
return;
}
Expand All @@ -1291,7 +1308,10 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
* In case we were paused right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
scn->scn_phys.scn_zb_objset = 0;
scn->scn_phys.scn_zb_object = 0;
scn->scn_phys.scn_zb_level = 0;
scn->scn_phys.scn_zb_blkid = 0;
zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE);
za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE);

Expand Down Expand Up @@ -1500,17 +1520,17 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
(longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
ASSERT(scn->scn_phys.scn_zb_objset == 0);
ASSERT(scn->scn_phys.scn_zb_object == 0);
ASSERT(scn->scn_phys.scn_zb_level == 0);
ASSERT(scn->scn_phys.scn_zb_blkid == 0);
} else {
zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
(longlong_t)tx->tx_txg,
(longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
(longlong_t)scn->scn_phys.scn_bookmark.zb_object,
(longlong_t)scn->scn_phys.scn_bookmark.zb_level,
(longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
(longlong_t)scn->scn_phys.scn_zb_objset,
(longlong_t)scn->scn_phys.scn_zb_object,
(longlong_t)scn->scn_phys.scn_zb_level,
(longlong_t)scn->scn_phys.scn_zb_blkid);
}

scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
Expand Down

0 comments on commit 10fe651

Please sign in to comment.