Skip to content

Commit

Permalink
Backport two changes that can improve rebuild stability (#14452)
Browse files Browse the repository at this point in the history
DAOS-14010 vos: Allow rebuild overwrite (#13326)
DAOS-15670 vos: SV overwrite missed tx_add_range() (#14241)

* Partial overwrite is difficult for upper layer to avoid.  If we
reserve a range of minor epochs for rebuild, we can just always allow
rebuild to overwrite what any extents currently in the tree for the
given major epoch.

* In SV overwerite case, the btr_update_record() will defer free
the original record and allocate new record for record replacing,
however, btr_node_tx_add() is mistakenly skipped in btr_update(),
that leads to:
1. In md-on-ssd mode, tree node changes are missed in WAL.
2. In pmem mode, tree node snapshot is missed in undo log.

Signed-off-by: Jeff Olivier <[email protected]>
Signed-off-by: Niu Yawei <[email protected]>
  • Loading branch information
jolivier23 authored Jun 5, 2024
1 parent 533ce74 commit 856daa0
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 93 deletions.
15 changes: 10 additions & 5 deletions src/common/btree.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1994,27 +1994,32 @@ btr_update(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
struct btr_record *rec;
int rc;
char sbuf[BTR_PRINT_BUF];
struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1];

rec = btr_trace2rec(tcx, tcx->tc_depth - 1);

D_DEBUG(DB_TRACE, "Update record %s\n",
btr_rec_string(tcx, rec, true, sbuf, BTR_PRINT_BUF));

rc = btr_rec_update(tcx, rec, key, val, val_out);
if (rc == -DER_NO_PERM) { /* cannot make inplace change */
struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1];

if (rc == -DER_NO_PERM) {
D_DEBUG(DB_TRACE, "Replace the original record\n");
if (btr_has_tx(tcx)) {
rc = btr_node_tx_add(tcx, trace->tr_node);
if (rc != 0)
goto out;
}

D_DEBUG(DB_TRACE, "Replace the original record\n");
rc = btr_rec_free(tcx, rec, NULL);
if (rc)
goto out;
rc = btr_rec_alloc(tcx, key, val, rec, val_out);
} else if (rc == 1) {
D_DEBUG(DB_TRACE, "Replace the record by btr_rec_update()\n");
if (btr_has_tx(tcx))
rc = btr_node_tx_add(tcx, trace->tr_node);
else
rc = 0;
}
out:
if (rc != 0) { /* failed */
Expand Down
6 changes: 1 addition & 5 deletions src/include/daos_srv/evtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,11 +376,7 @@ struct evt_entry_array {
/** Maximum size of array */
uint32_t ea_max;
/** Number of bytes per index */
uint32_t ea_inob;
/** Index of first delete record, valid if ea_delete_nr != 0 */
uint32_t ea_first_delete;
/** Number of delete records */
uint32_t ea_delete_nr;
uint32_t ea_inob;
/* Small array of embedded entries */
struct evt_list_entry ea_embedded_ents[0];
};
Expand Down
2 changes: 1 addition & 1 deletion src/include/daos_srv/vos_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <daos/dtx.h>
#include <daos/checksum.h>

#define VOS_SUB_OP_MAX ((uint16_t)-2)
#define VOS_SUB_OP_MAX (((uint16_t)-1) >> 2)

#define VOS_POOL_DF_2_2 24
#define VOS_POOL_DF_2_4 25
Expand Down
15 changes: 15 additions & 0 deletions src/vos/evt_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@
#include <daos_srv/evtree.h>
#include "vos_internal.h"

/** Upper two bits are reserved. On disk, this will be (uint16_t)-2)
* for backward compatibility reasons.
* For the upper bits,
* 00 means distributed transaction write
* 11 is reserved for max normal write and removal records
* 01 is reserved for rebuild writes
* 10 is reserved for future use
*
* When converting to not durable values, we want all rebuild records to be
* greater than any normal write records. So, we use a smaller value here.
*/
#define EVT_TX_MINOR_MAX_DF ((uint16_t)-2)
#define EVT_REBUILD_MINOR_MIN (VOS_SUB_OP_MAX + 1)
#define EVT_REBUILD_MINOR_MAX (EVT_REBUILD_MINOR_MIN + VOS_SUB_OP_MAX)

/**
* Tree node types.
* NB: a node can be both root and leaf.
Expand Down
134 changes: 70 additions & 64 deletions src/vos/evtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ static inline void
evt_rect_read(struct evt_rect *rout, const struct evt_rect_df *rin)
{
rout->rc_epc = rin->rd_epc;
rout->rc_minor_epc = rin->rd_minor_epc;
if (rin->rd_minor_epc != EVT_TX_MINOR_MAX_DF)
rout->rc_minor_epc = rin->rd_minor_epc;
else
rout->rc_minor_epc = VOS_SUB_OP_MAX;
evt_ext_read(&rout->rc_ex, rin);
};

Expand All @@ -52,8 +55,11 @@ evt_rect_write(struct evt_rect_df *rout, const struct evt_rect *rin)
{
evt_len_write(rout, evt_rect_width(rin));
rout->rd_epc = rin->rc_epc;
rout->rd_minor_epc = rin->rc_minor_epc;
rout->rd_lo = rin->rc_ex.ex_lo;
if (rin->rc_minor_epc != VOS_SUB_OP_MAX)
rout->rd_minor_epc = rin->rc_minor_epc;
else
rout->rd_minor_epc = EVT_TX_MINOR_MAX_DF;
};

#define DF_BUF_LEN 128
Expand Down Expand Up @@ -159,10 +165,20 @@ time_cmp(uint64_t t1, uint64_t t2, int *out)
return true;
}

if (t1 < t2)
if (t1 < t2) {
/** Even if t2 is rebuild, overwrite just works here */
*out = RT_OVERLAP_OVER;
else
} else {
*out = RT_OVERLAP_UNDER;
if (t2 == EVT_REBUILD_MINOR_MIN) {
/** If t1 is also a rebuild, we should return
* RT_OVERLAP_SAME to force adjustment of new rebuild
* minor epoch.
*/
if (t1 < EVT_REBUILD_MINOR_MAX)
*out = RT_OVERLAP_SAME;
}
}

return false;
}
Expand Down Expand Up @@ -2249,6 +2265,7 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
struct evt_filter filter;
int rc;
int alt_rc = 0;
bool overwrite = false;

tcx = evt_hdl2tcx(toh);
if (tcx == NULL)
Expand Down Expand Up @@ -2279,6 +2296,13 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,

evt_ent_array_init(ent_array, 1);

/* For evt_remove_all, we only insert removals for things that
* are visible, so we shouldn't need to check for overwrite at
* all.
*/
if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX)
goto run_tx;

filter.fr_ex = entry->ei_rect.rc_ex;
filter.fr_epr.epr_lo = entry->ei_rect.rc_epc;
filter.fr_epr.epr_hi = entry->ei_bound;
Expand All @@ -2293,61 +2317,41 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
return rc;

if (ent_array->ea_ent_nr == 1) {
if (entry->ei_rect.rc_minor_epc == EVT_MINOR_EPC_MAX) {
/** Special case. This is an overlapping delete record
* which can happen when there are minor epochs
* involved. Rather than rejecting, insert prefix
* and/or suffix extents.
ent = evt_ent_array_get(ent_array, 0);
if (entry->ei_rect.rc_minor_epc == EVT_REBUILD_MINOR_MIN) {
ent_cpy = *entry;
ent_cpy.ei_rect.rc_minor_epc = ent->en_minor_epc + 1;
D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc <= EVT_REBUILD_MINOR_MAX,
"minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc);
/* This should never happen because otherwise, we would
* return no overlap.
*/
ent = evt_ent_array_get(ent_array, 0);
if (ent->en_ext.ex_lo <= entry->ei_rect.rc_ex.ex_lo &&
ent->en_ext.ex_hi >= entry->ei_rect.rc_ex.ex_hi) {
/** Nothing to do, existing extent contains
* the new one
*/
return 0;
}
D_ASSERTF(ent_cpy.ei_rect.rc_minor_epc > EVT_REBUILD_MINOR_MIN,
"minor_epc=%d\n", ent_cpy.ei_rect.rc_minor_epc);
entryp = &ent_cpy;
goto run_tx;
}
overwrite = true;
}

run_tx:
rc = evt_tx_begin(tcx);
if (rc != 0)
return rc;

if (tcx->tc_depth == 0) { /* empty tree */
rc = evt_root_activate(tcx, entry);
rc = evt_root_activate(tcx, entryp);
if (rc != 0)
goto out;
} else if (tcx->tc_inob == 0 && entry->ei_inob != 0) {
} else if (tcx->tc_inob == 0 && entryp->ei_inob != 0) {
rc = evt_root_tx_add(tcx);
if (rc != 0)
goto out;
tcx->tc_inob = tcx->tc_root->tr_inob = entry->ei_inob;
tcx->tc_inob = tcx->tc_root->tr_inob = entryp->ei_inob;
}

D_ASSERT(ent_array->ea_ent_nr <= 1);
if (ent_array->ea_ent_nr == 1) {
if (ent != NULL) {
memcpy(&ent_cpy, entry, sizeof(*entry));
entryp = &ent_cpy;
/** We need to edit the existing extent */
if (entry->ei_rect.rc_ex.ex_lo < ent->en_ext.ex_lo) {
ent_cpy.ei_rect.rc_ex.ex_hi = ent->en_ext.ex_lo - 1;
if (entry->ei_rect.rc_ex.ex_hi <= ent->en_ext.ex_hi)
goto insert;
/* There is also a suffix, so insert the prefix */
rc = evt_insert_entry(tcx, entryp, csum_bufp);
if (rc != 0)
goto out;
}

D_ASSERT(entry->ei_rect.rc_ex.ex_hi > ent->en_ext.ex_hi);
ent_cpy.ei_rect.rc_ex.ex_hi = entry->ei_rect.rc_ex.ex_hi;
ent_cpy.ei_rect.rc_ex.ex_lo = ent->en_ext.ex_hi + 1;

/* Now insert the suffix */
goto insert;
}
if (overwrite) {
/*
* NB: This is part of the current hack to keep "supporting"
* overwrite for same epoch, full overwrite.
Expand All @@ -2358,7 +2362,6 @@ evt_insert(daos_handle_t toh, const struct evt_entry_in *entry,
goto out;
}

insert:
/* Phase-2: Inserting */
rc = evt_insert_entry(tcx, entryp, csum_bufp);

Expand Down Expand Up @@ -2584,7 +2587,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
int at;
int i;
int rc = 0;
bool has_agg = false;
bool has_agg = false;

V_TRACE(DB_TRACE, "Searching rectangle "DF_RECT" opc=%d\n",
DP_RECT(rect), find_opc);
Expand Down Expand Up @@ -2704,27 +2707,29 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
" overlaps with " DF_RECT "\n",
DP_RECT(&rtmp), DP_RECT(rect));

/* NB: This is temporary to allow full overwrite
* in same epoch to avoid breaking rebuild.
* Without some sequence number and client
* identifier, we can't do this robustly.
* There can be a race between rebuild and
* client doing different updates. But this
* isn't any worse than what we already have in
* place so I did it this way to minimize
* change while we decide how to handle this
* properly.
*/
if (range_overlap != RT_OVERLAP_SAME) {
D_ERROR("Same epoch partial "
"overwrite not supported:"
DF_RECT" overlaps with "DF_RECT
"\n", DP_RECT(rect),
DP_RECT(&rtmp));
rc = -DER_VOS_PARTIAL_UPDATE;
goto out;
if (rect->rc_minor_epc != EVT_REBUILD_MINOR_MIN) {
if (range_overlap != RT_OVERLAP_SAME) {
D_ERROR("Same epoch partial overwrite not "
"supported: " DF_RECT
" overlaps with " DF_RECT "\n",
DP_RECT(rect), DP_RECT(&rtmp));
rc = -DER_VOS_PARTIAL_UPDATE;
goto out;
}
}
break; /* we can update the record in place */

if (ent_array->ea_ent_nr == 0)
break;

/** If the extent is an overlap, partial or
* otherwise, we want to keep only the one with
* the highest minor epoch and let the caller
* deal with it.
*/
ent = evt_ent_array_get(ent_array, 0);
if (ent->en_minor_epc > rtmp.rc_minor_epc)
continue;
goto replace_ent;
case EVT_FIND_SAME:
if (range_overlap != RT_OVERLAP_SAME)
continue;
Expand Down Expand Up @@ -2762,6 +2767,7 @@ evt_ent_array_fill(struct evt_context *tcx, enum evt_find_opc find_opc,
goto out;
}

replace_ent:
evt_entry_fill(tcx, node, i, rect, intent, ent);
switch (find_opc) {
default:
Expand Down
6 changes: 4 additions & 2 deletions src/vos/tests/evt_ctl.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash

if [ "$USE_VALGRIND" = "memcheck" ]; then
VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \
if [ "$USE_VALGRIND" = "memcheck" ]; then VCMD="valgrind --leak-check=full --show-reachable=yes --error-limit=no \
--suppressions=${VALGRIND_SUPP} --error-exitcode=42 --xml=yes \
--xml-file=unit-test-evt_ctl-%p.memcheck.xml"
elif [ "$USE_VALGRIND" = "pmemcheck" ]; then
Expand Down Expand Up @@ -167,6 +166,9 @@ cmd+=" -C o:5 -a 0-1@1:ab -a 1-2@2:cd -a 3-4@3:bc -a 5-7@4:def -a 6-8@5:xyz"
cmd+=" -a 1-2@6:aa -a 4-7@7:abcd -b -2 -r 0-5@8 -b -2 -r 0-5@3 -b -2 -D"
cmd+=" -C o:4 -a [email protected]:abcdef -a [email protected]:fedcba -a [email protected]:foo -r 2-8@0-1"
cmd+=" -l0-10@0-10:c -r 7-9@0-1 -l0-10@0-10:C -b -2 -D"
cmd+=" -C o:15 -a [email protected]:abcdef -a [email protected]:ff -a [email protected]:abc -a [email protected]:ab"
cmd+=" -a [email protected]:abc -a [email protected]:vab -a [email protected]:ab -a [email protected]:a"
cmd+=" -b -2 -D"
echo "$cmd"
eval "$cmd"
result="${PIPESTATUS[0]}"
Expand Down
3 changes: 1 addition & 2 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@
#include "vos_ilog.h"
#include "vos_obj.h"

#define VOS_MINOR_EPC_MAX (VOS_SUB_OP_MAX + 1)
D_CASSERT(VOS_MINOR_EPC_MAX == EVT_MINOR_EPC_MAX);
#define VOS_MINOR_EPC_MAX EVT_MINOR_EPC_MAX

#define VOS_TX_LOG_FAIL(rc, ...) \
do { \
Expand Down
26 changes: 13 additions & 13 deletions src/vos/vos_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,11 @@ struct vos_io_context {
/** the total size of the IO */
uint64_t ic_io_size;
/** flags */
unsigned int ic_update:1,
ic_size_fetch:1,
ic_save_recx:1,
ic_dedup:1, /** candidate for dedup */
ic_dedup_verify:1,
ic_read_ts_only:1,
ic_check_existence:1,
ic_remove:1,
ic_skip_fetch:1,
ic_agg_needed:1,
ic_ec:1; /**< see VOS_OF_EC */
unsigned int ic_update : 1, ic_size_fetch : 1, ic_save_recx : 1,
ic_dedup : 1, /** candidate for dedup */
ic_dedup_verify : 1, ic_read_ts_only : 1, ic_check_existence : 1, ic_remove : 1,
ic_skip_fetch : 1, ic_agg_needed : 1, ic_skip_akey_support : 1, ic_rebuild : 1,
ic_ec : 1; /**< see VOS_OF_EC */
/**
* Input shadow recx lists, one for each iod. Now only used for degraded
* mode EC obj fetch handling.
Expand Down Expand Up @@ -639,6 +633,7 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only,
ioc->ic_read_ts_only = 1;
ioc->ic_remove = ((vos_flags & VOS_OF_REMOVE) != 0);
ioc->ic_ec = ((vos_flags & VOS_OF_EC) != 0);
ioc->ic_rebuild = ((vos_flags & VOS_OF_REBUILD) != 0);
ioc->ic_umoffs_cnt = ioc->ic_umoffs_at = 0;
ioc->ic_iod_csums = iod_csums;
vos_ilog_fetch_init(&ioc->ic_dkey_info);
Expand Down Expand Up @@ -2296,6 +2291,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err,
struct vos_io_context *ioc = vos_ioh2ioc(ioh);
struct umem_instance *umem;
bool tx_started = false;
uint16_t minor_epc;

D_ASSERT(ioc->ic_update);
vos_dedup_verify_fini(ioh);
Expand Down Expand Up @@ -2340,9 +2336,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err,
if (err != 0)
goto abort;

if (dtx_is_valid_handle(dth))
minor_epc = dth->dth_op_seq;
else
minor_epc = ioc->ic_rebuild ? EVT_REBUILD_MINOR_MIN : VOS_SUB_OP_MAX;

/* Update tree index */
err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ?
dth->dth_op_seq : VOS_SUB_OP_MAX);
err = dkey_update(ioc, pm_ver, dkey, minor_epc);
if (err) {
VOS_TX_LOG_FAIL(err, "Failed to update tree index: "DF_RC"\n",
DP_RC(err));
Expand Down
Loading

0 comments on commit 856daa0

Please sign in to comment.