Skip to content

Commit

Permalink
Fix kernel panic induced by redacted send
Browse files Browse the repository at this point in the history
In the redaction list traversal code, there is a bug in the binary search
logic when looking for the resume point. Maxbufid can be decremented to -1,
causing us to read the last possible block of the object instead of the one we
wanted. This can cause incorrect resume behavior, or possibly even a hang in
some cases. In addition, when examining non-last blocks, we can treat the
block as being the same size as the last block, causing us to miss entries in
the redaction list when determining where to resume. Finally, we were ignoring
the case where the resume point was found in the buffer being searched, and
resuming from minbufid. All these issues have been corrected, and the code has
been significantly simplified to make future issues less likely.

Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
Closes openzfs#11297
  • Loading branch information
pcd1193182 authored and RageLtMan committed May 31, 2021
1 parent b96cc66 commit 894fddb
Showing 1 changed file with 53 additions and 61 deletions.
114 changes: 53 additions & 61 deletions module/zfs/dsl_bookmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -1561,33 +1561,6 @@ dsl_bookmark_latest_txg(dsl_dataset_t *ds)
return (dbn->dbn_phys.zbm_creation_txg);
}

static inline unsigned int
redact_block_buf_num_entries(unsigned int size)
{
return (size / sizeof (redact_block_phys_t));
}

/*
* This function calculates the offset of the last entry in the array of
* redact_block_phys_t. If we're reading the redaction list into buffers of
* size bufsize, then for all but the last buffer, the last valid entry in the
* array will be the last entry in the array. However, for the last buffer, any
* amount of it may be filled. Thus, we check to see if we're looking at the
* last buffer in the redaction list, and if so, we return the total number of
* entries modulo the number of entries per buffer. Otherwise, we return the
* number of entries per buffer minus one.
*/
static inline unsigned int
last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid)
{
if (bufid == (rl->rl_phys->rlp_num_entries - 1) /
redact_block_buf_num_entries(bufsize)) {
return ((rl->rl_phys->rlp_num_entries - 1) %
redact_block_buf_num_entries(bufsize));
}
return (redact_block_buf_num_entries(bufsize) - 1);
}

/*
* Compare the redact_block_phys_t to the bookmark. If the last block in the
* redact_block_phys_t is before the bookmark, return -1. If the first block in
Expand Down Expand Up @@ -1633,8 +1606,6 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
rl_traverse_callback_t cb, void *arg)
{
objset_t *mos = rl->rl_mos;
redact_block_phys_t *buf;
unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
int err = 0;

if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
Expand All @@ -1651,42 +1622,48 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
}

/*
* Binary search for the point to resume from. The goal is to minimize
* the number of disk reads we have to perform.
* This allows us to skip the binary search and resume checking logic
* below, if we're not resuming a redacted send.
*/
if (ZB_IS_ZERO(resume))
resume = NULL;

/*
* Binary search for the point to resume from.
*/
buf = zio_data_buf_alloc(bufsize);
uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) /
redact_block_buf_num_entries(bufsize);
uint64_t minbufid = 0;
while (resume != NULL && maxbufid - minbufid >= 1) {
ASSERT3U(maxbufid, >, minbufid);
uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2);
err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize,
buf, DMU_READ_NO_PREFETCH);
uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
uint64_t minidx = 0;
while (resume != NULL && maxidx > minidx) {
redact_block_phys_t rbp = { 0 };
ASSERT3U(maxidx, >, minidx);
uint64_t mididx = minidx + ((maxidx - minidx) / 2);
err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
if (err != 0)
break;

int cmp0 = redact_block_zb_compare(&buf[0], resume);
int cmpn = redact_block_zb_compare(
&buf[last_entry(rl, bufsize, maxbufid)], resume);
int cmp = redact_block_zb_compare(&rbp, resume);

/*
* If the first block is before or equal to the resume point,
* and the last one is equal or after, then the resume point is
* in this buf, and we should start here.
*/
if (cmp0 <= 0 && cmpn >= 0)
if (cmp == 0) {
minidx = mididx;
break;

if (cmp0 > 0)
maxbufid = midbufid - 1;
else if (cmpn < 0)
minbufid = midbufid + 1;
else
panic("No progress in binary search for resume point");
} else if (cmp > 0) {
maxidx =
(mididx == minidx ? minidx : mididx - 1);
} else {
minidx = mididx + 1;
}
}

for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize);
unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);

unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
uint64_t start_block = minidx / entries_per_buf;
err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
DMU_READ_PREFETCH);

for (uint64_t curidx = minidx;
err == 0 && curidx < rl->rl_phys->rlp_num_entries;
curidx++) {
/*
Expand All @@ -1696,22 +1673,35 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
* prefetching, and this code shouldn't be the bottleneck, so we
* don't need to do manual prefetching.
*/
if (curidx % redact_block_buf_num_entries(bufsize) == 0) {
if (curidx % entries_per_buf == 0) {
err = dmu_read(mos, rl->rl_object, curidx *
sizeof (*buf), bufsize, buf,
DMU_READ_PREFETCH);
if (err != 0)
break;
}
redact_block_phys_t *rb = &buf[curidx %
redact_block_buf_num_entries(bufsize)];
redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
/*
* If resume is non-null, we should either not send the data, or
* null out resume so we don't have to keep doing these
* comparisons.
*/
if (resume != NULL) {
/*
* It is possible that after the binary search we got
* a record before the resume point. There's two cases
* where this can occur. If the record is the last
* redaction record, and the resume point is after the
* end of the redacted data, curidx will be the last
* redaction record. In that case, the loop will end
* after this iteration. The second case is if the
* resume point is between two redaction records, the
* binary search can return either the record before
* or after the resume point. In that case, the next
* iteration will be greater than the resume point.
*/
if (redact_block_zb_compare(rb, resume) < 0) {
ASSERT3U(curidx, ==, minidx);
continue;
} else {
/*
Expand All @@ -1733,8 +1723,10 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
}
}

if (cb(rb, arg) != 0)
if (cb(rb, arg) != 0) {
err = EINTR;
break;
}
}

zio_data_buf_free(buf, bufsize);
Expand Down

0 comments on commit 894fddb

Please sign in to comment.