Skip to content

Commit

Permalink
OpenZFS 9238 - ZFS Spacemap Encoding V2
Browse files Browse the repository at this point in the history
Motivation
==========

The current space map encoding has the following disadvantages:
[1] Assuming 512 sector size each entry can represent at most 16MB for a segment.
    This makes the encoding very inefficient for large regions of space.
[2] As vdev-wide space maps have started to be used by new features (i.e.
    device removal, zpool checkpoint) we've started imposing limits in the
    vdevs that can be used with them based on the maximum addressable offset
    (currently 64PB for a top-level vdev).

New encoding
============

The layout can be found at space_map.h and it remains backwards compatible with
the old one. The introduced two-word entry format, besides extending the limits
imposed by the single-entry layout, also includes a vdev field and some extra
padding after its prefix.

The extra padding after the prefix should is reserved for future usage (e.g.
new prefixes for future encodings or new fields for flags). The new vdev field
not only makes the space maps more self-descriptive, but also opens the doors
for pool-wide space maps (expected to be used in the log spacemap project).

One final important note is that the number of bits used for vdevs is reduced
to 24 bits for blkptrs. That was decided as we don't know of any setups that
use more than 16M vdevs for the time being and we wanted to fit the vdev field
in the space map. In addition that gives us some extra bits in dva_t.

Other references:
=================

The new encoding is also discussed towards the end of the Log Space Map
presentation from 2017's OpenZFS summit.
Link: https://www.youtube.com/watch?v=jj2IxRkl5bQ

Authored by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Gordon Ross <[email protected]>
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Tim Chase <[email protected]>

OpenZFS-commit: openzfs/openzfs@90a56e6d
OpenZFS-issue: https://www.illumos.org/issues/9238
Closes openzfs#7665
  • Loading branch information
sdimitro authored and behlendorf committed Jul 5, 2018
1 parent 4e82b4b commit 4d044c4
Show file tree
Hide file tree
Showing 15 changed files with 855 additions and 361 deletions.
125 changes: 77 additions & 48 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,6 @@ verify_spacemap_refcounts(spa_t *spa)
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
uint64_t alloc, offset, entry;
const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
"INVALID", "INVALID", "INVALID", "INVALID" };

Expand All @@ -793,41 +792,73 @@ dump_spacemap(objset_t *os, space_map_t *sm)
/*
* Print out the freelist entries in both encoded and decoded form.
*/
alloc = 0;
for (offset = 0; offset < space_map_length(sm);
offset += sizeof (entry)) {
uint8_t mapshift = sm->sm_shift;
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
uint64_t word;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {

VERIFY0(dmu_read(os, space_map_object(sm), offset,
sizeof (entry), &entry, DMU_READ_PREFETCH));
if (SM_DEBUG_DECODE(entry)) {
sizeof (word), &word, DMU_READ_PREFETCH));

if (sm_entry_is_debug(word)) {
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (entry)),
ddata[SM_DEBUG_ACTION_DECODE(entry)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
(u_longlong_t)(offset / sizeof (word)),
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
continue;
}

uint8_t words;
char entry_type;
uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

if (sm_entry_is_single_word(word)) {
entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
'A' : 'F';
entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
sm->sm_start;
entry_run = SM_RUN_DECODE(word) << mapshift;
words = 1;
} else {
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx\n",
(u_longlong_t)(offset / sizeof (entry)),
SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
mapshift) + sm->sm_start),
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
mapshift) + sm->sm_start +
(SM_RUN_DECODE(entry) << mapshift)),
(u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
if (SM_TYPE_DECODE(entry) == SM_ALLOC)
alloc += SM_RUN_DECODE(entry) << mapshift;
else
alloc -= SM_RUN_DECODE(entry) << mapshift;
/* it is a two-word entry so we read another word */
ASSERT(sm_entry_is_double_word(word));

uint64_t extra_word;
offset += sizeof (extra_word);
VERIFY0(dmu_read(os, space_map_object(sm), offset,
sizeof (extra_word), &extra_word,
DMU_READ_PREFETCH));

ASSERT3U(offset, <=, space_map_length(sm));

entry_run = SM2_RUN_DECODE(word) << mapshift;
entry_vdev = SM2_VDEV_DECODE(word);
entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
'A' : 'F';
entry_off = (SM2_OFFSET_DECODE(extra_word) <<
mapshift) + sm->sm_start;
words = 2;
}

(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
(u_longlong_t)(offset / sizeof (word)),
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
(u_longlong_t)entry_vdev, words);

if (entry_type == 'A')
alloc += entry_run;
else
alloc -= entry_run;
}
if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%llu) INCONSISTENT "
"with space map summary (%llu)\n",
(u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
if ((uint64_t)alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
"with space map summary (%lld)\n",
(longlong_t)space_map_allocated(sm), (longlong_t)alloc);
}
}

Expand Down Expand Up @@ -1158,7 +1189,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
dump_dde(ddt, &dde, walk);

ASSERT(error == ENOENT);
ASSERT3U(error, ==, ENOENT);

(void) printf("\n");
}
Expand Down Expand Up @@ -3579,15 +3610,14 @@ typedef struct checkpoint_sm_exclude_entry_arg {
} checkpoint_sm_exclude_entry_arg_t;

static int
checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
void *arg)
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
checkpoint_sm_exclude_entry_arg_t *cseea = arg;
vdev_t *vd = cseea->cseea_vd;
metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
uint64_t end = offset + size;
metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
uint64_t end = sme->sme_offset + sme->sme_run;

ASSERT(type == SM_FREE);
ASSERT(sme->sme_type == SM_FREE);

/*
* Since the vdev_checkpoint_sm exists in the vdev level
Expand All @@ -3605,18 +3635,18 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
* metaslab boundaries. So if needed we could add code
* that handles metaslab-crossing segments in the future.
*/
VERIFY3U(offset, >=, ms->ms_start);
VERIFY3U(sme->sme_offset, >=, ms->ms_start);
VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

/*
* By removing the entry from the allocated segments we
* also verify that the entry is there to begin with.
*/
mutex_enter(&ms->ms_lock);
range_tree_remove(ms->ms_allocatable, offset, size);
range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);

cseea->cseea_checkpoint_size += size;
cseea->cseea_checkpoint_size += sme->sme_run;
return (0);
}

Expand Down Expand Up @@ -4606,15 +4636,14 @@ typedef struct verify_checkpoint_sm_entry_cb_arg {
#define ENTRIES_PER_PROGRESS_UPDATE 10000

static int
verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
void *arg)
verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
{
verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
vdev_t *vd = vcsec->vcsec_vd;
metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
uint64_t end = offset + size;
metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
uint64_t end = sme->sme_offset + sme->sme_run;

ASSERT(type == SM_FREE);
ASSERT(sme->sme_type == SM_FREE);

if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
(void) fprintf(stderr,
Expand All @@ -4628,7 +4657,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
/*
* See comment in checkpoint_sm_exclude_entry_cb()
*/
VERIFY3U(offset, >=, ms->ms_start);
VERIFY3U(sme->sme_offset, >=, ms->ms_start);
VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

/*
Expand All @@ -4637,7 +4666,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
* their respective ms_allocateable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
range_tree_verify(ms->ms_allocatable, offset, size);
range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);

return (0);
Expand Down Expand Up @@ -4883,7 +4912,7 @@ verify_checkpoint(spa_t *spa)
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

if (error == ENOENT) {
if (error == ENOENT && !dump_opt['L']) {
/*
* If the feature is active but the uberblock is missing
* then we must be in the middle of discarding the
Expand All @@ -4906,7 +4935,7 @@ verify_checkpoint(spa_t *spa)
error = 3;
}

if (error == 0)
if (error == 0 && !dump_opt['L'])
verify_checkpoint_blocks(spa);

return (error);
Expand Down Expand Up @@ -5015,7 +5044,7 @@ dump_zpool(spa_t *spa)
if (dump_opt['h'])
dump_history(spa);

if (rc == 0 && !dump_opt['L'])
if (rc == 0)
rc = verify_checkpoint(spa);

if (rc != 0) {
Expand Down
7 changes: 7 additions & 0 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
Expand Down Expand Up @@ -7349,6 +7350,12 @@ main(int argc, char **argv)

dprintf_setup(&argc, argv);
zfs_deadman_synctime_ms = 300000;
/*
* As two-word space map entries may not come up often (especially
* if pool and vdev sizes are small) we want to force at least some
* of them so the feature get tested.
*/
zfs_force_some_double_word_sm_entries = B_TRUE;

action.sa_handler = sig_handler;
sigemptyset(&action.sa_mask);
Expand Down
12 changes: 7 additions & 5 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ _NOTE(CONSTCOND) } while (0)
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */

#define SPA_COMPRESSBITS 7
#define SPA_VDEVBITS 24

/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
Expand All @@ -177,15 +178,15 @@ typedef struct zio_cksum_salt {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | vdev1 | GRID | ASIZE |
* 0 | pad | vdev1 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | vdev2 | GRID | ASIZE |
* 2 | pad | vdev2 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 4 | vdev3 | GRID | ASIZE |
* 4 | pad | vdev3 | GRID | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
Expand Down Expand Up @@ -443,8 +444,9 @@ typedef struct blkptr {
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)

#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define DVA_SET_VDEV(dva, x) \
BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)

#define DVA_GET_OFFSET(dva) \
BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
Expand Down
Loading

0 comments on commit 4d044c4

Please sign in to comment.