From ecf85af047cf8ff85abcbc615439e5537bb40dd1 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 22 Oct 2024 09:57:40 +0800 Subject: [PATCH] DAOS-16721 object: fix coll RPC for obj with sparse layout The old implementation did not correctly calculate some collective object RPC size, and may cause trouble when need bulk data transfer for large collective object RPC. It also potentially affects how to dispatch collective RPCs from leader to other engines. The patch also addes more sanity check for coll-punch RPC to detect potential DRAM corruption. Allow-unstable-test: true Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 6 ++- src/dtx/dtx_rpc.c | 5 +- src/object/cli_coll.c | 111 +++++++++++++++++++++++++------------- src/object/cli_shard.c | 7 +-- src/object/obj_internal.h | 8 ++- src/object/obj_utils.c | 2 +- src/object/srv_coll.c | 12 +++-- src/object/srv_obj.c | 8 +-- src/vos/vos_dtx.c | 3 -- 9 files changed, 107 insertions(+), 55 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 863307e9a7f6..7e7c81991551 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -80,8 +80,12 @@ dtx_coll_prep_ult(void *arg) DP_UUID(cont->sc_uuid), DP_RC(rc)); } - if (dcpa->dcpa_result != 0) + if (dcpa->dcpa_result != 0) { + if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", + DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); goto out; + } dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, dci->dci_version, cont->sc_pool->spc_map_version, diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5423f6c0cc88..2ccbfec2734d 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -974,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) { + D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", + DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; + } drop = true; goto next; diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index e05abadf3cf4..3285bec58b34 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool if (coa->coa_sparse == 0) coa->coa_dct_cap = obj_ranks; } + + /* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). */ + coa->coa_min_rank = obj->cob_min_rank; + D_RWLOCK_UNLOCK(&obj->cob_lock); if (coa->coa_sparse) { @@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool coa->coa_dct_nr = -1; } - coa->coa_max_dct_sz = 0; coa->coa_max_shard_nr = 0; coa->coa_max_bitmap_sz = 0; coa->coa_target_nr = 0; @@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa) coa->coa_dct_nr = 0; } +static void +obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct, + uint32_t *size, bool copy) +{ + struct daos_coll_shard *dcs; + uint32_t dct_size; + int i; + + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (i = 0; i <= dct->dct_max_shard; i++) { + dcs = &dct->dct_shards[i]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (copy) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; +} + +struct obj_coll_tree_args { + struct coll_oper_args *coa; + uint32_t *size; +}; + static int obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) { - struct coll_oper_args *coa = arg; - struct daos_coll_target *dct = val->iov_buf; + struct obj_coll_tree_args *octa = arg; + struct coll_oper_args *coa = octa->coa; + struct daos_coll_target *dct = val->iov_buf; D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap, "Too short pre-allcoated dct_array: %u vs %u\n", coa->coa_dct_nr, coa->coa_dct_cap); + D_ASSERT(dct->dct_bitmap != NULL); - memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct)); + obj_coll_collapse_one(coa, dct, octa->size, true); /* The following members have been migrated into coa->coa_dcts. */ dct->dct_bitmap = NULL; @@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) static int obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) { + struct obj_coll_tree_args octa; struct coll_sparse_targets *tree = coa->coa_tree; int rc = 0; @@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) D_GOTO(out, rc = -DER_NOMEM); coa->coa_sparse = 0; - rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa); + coa->coa_raw_sparse = 1; + coa->coa_dct_nr = 0; + coa->coa_max_dct_sz = 0; + + octa.coa = coa; + octa.size = size; + rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, + obj_coll_tree_cb, &octa); if (rc == 0) D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap, "Something is wrong when prepare coll target array: %u vs %u\n", @@ -287,36 +336,13 @@ static int obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size) { struct daos_coll_target *dct; - struct daos_coll_shard *dcs; - uint32_t dct_size; int i; - int j; - for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0; + i < coa->coa_dct_cap; i++) { dct = &coa->coa_dcts[i]; - if (dct->dct_bitmap != NULL) { - /* The size may be over estimated, no matter. */ - dct_size = sizeof(*dct) + dct->dct_bitmap_sz + - sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); - - for (j = 0; j <= dct->dct_max_shard; j++) { - dcs = &dct->dct_shards[j]; - if (dcs->dcs_nr > 1) - dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; - } - - if (coa->coa_for_modify) - dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; - - if (coa->coa_max_dct_sz < dct_size) - coa->coa_max_dct_sz = dct_size; - - if (coa->coa_dct_nr < i) - memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); - - coa->coa_dct_nr++; - *size += dct_size; - } + if (dct->dct_bitmap != NULL) + obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i); } /* Reset the other dct slots to avoid double free during cleanup. */ @@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, D_RWLOCK_RDLOCK(&obj->cob_lock); - D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank, - "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank); + D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank, + "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n", + DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver); D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank, "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank); @@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo uint32_t tgt_size = 0; uint32_t mbs_max_size; uint32_t inline_size; - uint32_t flags = ORF_LEADER; uint32_t leader = -1; uint32_t len; int rc; @@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo memcpy(dct, &tmp_tgt, sizeof(tmp_tgt)); } + /* 'shard' is on the leader target that is must be the coa->coa_dcts[0]. */ + D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank, + "Object "DF_OID" target array corrupted: rank %u vs %ur, nr %u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, + coa->coa_dcts[0].dct_rank, coa->coa_dct_nr); + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); if (rc < 0) goto out; @@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + auxi->flags = ORF_LEADER; if (auxi->io_retry) { - flags |= ORF_RESEND; + auxi->flags |= ORF_RESEND; /* Reset @enqueue_id if resend to new leader. */ if (spa->pa_auxi.target != shard->do_target_id) spa->pa_auxi.enqueue_id = 0; } else { + auxi->flags &= ~ORF_RESEND; spa->pa_auxi.obj_auxi = auxi; daos_dti_gen(&spa->pa_dti, false); } @@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo spa->pa_auxi.shard = shard->do_shard_idx; if (obj_is_ec(obj)) - flags |= ORF_EC; + auxi->flags |= ORF_EC; mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, - args->flags, flags, map_ver, &auxi->map_ver_reply, task); + args->flags, auxi->flags, map_ver, + &auxi->map_ver_reply, task); out: if (rc > 0) diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 36c5c5f1e0c3..0c9dfc1418e5 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1453,9 +1453,10 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), - DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, - (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, + *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ae5c9c82fd18..3800d53076f0 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -293,10 +293,16 @@ struct coll_oper_args { struct shard_auxi_args coa_auxi; int coa_dct_nr; uint32_t coa_dct_cap; - uint32_t coa_max_dct_sz; + union { + /* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */ + uint32_t coa_min_rank; + /* Be used since obj_coll_oper_args_collapse(), will overwrite obj->cob_min_rank. */ + uint32_t coa_max_dct_sz; + }; uint8_t coa_max_shard_nr; uint8_t coa_max_bitmap_sz; uint8_t coa_for_modify:1, + coa_raw_sparse:1, coa_sparse:1; uint8_t coa_target_nr; /* diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 7bf0ef4aaf93..82d91c966acc 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -633,7 +633,7 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t * use the "cur_pos" as the relay engine. */ pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; - if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); memcpy(dct, &tmp, sizeof(tmp)); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2a152b47bd64..5b59f954f865 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct int i; int j; - /* dcts[0] is for current engine. */ - if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) { + /* dcts[0] must be for current engine. */ + if (unlikely(dcts[0].dct_rank != dss_self_rank())) { + D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n", + dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags); + D_GOTO(out, rc = -DER_INVAL); + } + + if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL)) { D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, dcts[0].dct_shards == NULL ? "empty" : "non-empty"); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a51682b47853..2f4d91f0020d 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -5605,12 +5605,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " - DF_DTI", forward width %u, forward depth %u\n", + DF_DTI", forward width %u, forward depth %u, flags %x\n", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); D_ASSERT(dmi->dmi_xs_id != 0); @@ -5747,13 +5747,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u", + "tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); obj_punch_complete(rpc, rc, max_ver); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 460abfeec57d..0a325088a778 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1972,9 +1972,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = -DER_INPROGRESS; } - if (rc < 0) - D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); - return rc; }