diff --git a/ci/patch_src_in_place b/ci/patch_src_in_place index 7f49ab77a55..2a51eabac27 100755 --- a/ci/patch_src_in_place +++ b/ci/patch_src_in_place @@ -1,7 +1,7 @@ #!/bin/sh # shellcheck disable=SC2046,SC2035 -codespell -w --ignore-words-list nd,uint,ths,ba,creat,te,cas,mapp,pres,crashers,dout,tre,reord,mimick,cloneable,keypair,bject,tread,cancelled --builtin clear,rare,informal,names,en-GB_to_en-US --skip *.png,*.PNG,*.pyc,src/rdb/raft/*,src/control/vendor/*,RSA.golden $(git ls-tree --full-tree --name-only HEAD) +codespell -w --ignore-words-list dedup,nd,uint,ths,ba,creat,te,cas,mapp,pres,crashers,dout,tre,reord,mimick,cloneable,keypair,bject,tread,cancelled --builtin clear,rare,informal,names,en-GB_to_en-US --skip *.png,*.PNG,*.pyc,src/rdb/raft/*,src/control/vendor/*,RSA.golden $(git ls-tree --full-tree --name-only HEAD) # The return code of codespell is the number of works it could not correct # because of multiple options. We could report on these but they're rare diff --git a/ci/unit/test_main_node.sh b/ci/unit/test_main_node.sh index a3b726ad4a0..48b5222a9e0 100755 --- a/ci/unit/test_main_node.sh +++ b/ci/unit/test_main_node.sh @@ -3,7 +3,7 @@ # This is a script to be run by the unit/test_main.sh to run a test # on a CI node. -set -ex +set -x sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq' if grep /mnt/daos\ /proc/mounts; then diff --git a/src/common/mem.c b/src/common/mem.c index 35db0d02de9..5f32fbd88ea 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -263,6 +263,15 @@ pmem_tx_commit(struct umem_instance *umm) return rc ? umem_tx_errno(rc) : 0; } +static void +pmem_defer_free(struct umem_instance *umm, umem_off_t off, + struct pobj_action *act) +{ + PMEMoid id = umem_off2id(umm, off); + + pmemobj_defer_free(umm->umm_pool, id, act); +} + static umem_off_t pmem_reserve(struct umem_instance *umm, struct pobj_action *act, size_t size, unsigned int type_num) @@ -359,6 +368,7 @@ static umem_ops_t pmem_ops = { .mo_tx_begin = pmem_tx_begin, .mo_tx_commit = pmem_tx_commit, .mo_reserve = pmem_reserve, + .mo_defer_free = pmem_defer_free, .mo_cancel = pmem_cancel, .mo_tx_publish = pmem_tx_publish, .mo_tx_add_callback = pmem_tx_add_callback, diff --git a/src/container/cli_internal.h b/src/container/cli_internal.h index 4b4e33bfb45..7dceca8b130 100644 --- a/src/container/cli_internal.h +++ b/src/container/cli_internal.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016 Intel Corporation. + * (C) Copyright 2016-2020 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index d55ac3f3159..d5eefd2ca69 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -313,23 +313,13 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_modification_cnt = sub_modification_cnt; dth->dth_op_seq = 0; - dth->dth_rsrvd_cnt = 0; dth->dth_oid_cnt = 0; dth->dth_oid_cap = 0; dth->dth_oid_array = NULL; dth->dth_dkey_hash = 0; - if (sub_modification_cnt <= 1) { - dth->dth_rsrvds = &dth->dth_rsrvd_inline; - return 0; - } - - D_ALLOC_ARRAY(dth->dth_rsrvds, sub_modification_cnt); - if (dth->dth_rsrvds == NULL) - return -DER_NOMEM; - - return 0; + return vos_dtx_rsrvd_init(dth); } static int @@ -764,8 +754,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, D_FREE(dlh->dlh_subs); D_FREE(dth->dth_oid_array); - if (dth->dth_rsrvds != &dth->dth_rsrvd_inline) - D_FREE(dth->dth_rsrvds); + vos_dtx_rsrvd_fini(dth); return result; } diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 0521fbe4a55..8e42bf2726c 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019 Intel Corporation. + * (C) Copyright 2019-2020 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,4 +117,5 @@ int dtx_abort(uuid_t po_uuid, uuid_t co_uuid, daos_epoch_t epoch, struct dtx_entry **dtes, int count); int dtx_check(uuid_t po_uuid, uuid_t co_uuid, struct dtx_entry *dte); + #endif /* __DTX_INTERNAL_H__ */ diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index 621211f7202..09a122694aa 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -232,6 +232,18 @@ typedef struct { struct pobj_action *act, size_t size, unsigned int type_num); + /** + * Defer free til commit. For use with reserved extents that are not + * yet published. For VMEM, it just calls free. + * + * \param umm [IN] umem class instance. + * \param off [IN] offset of allocation + * \param act [IN|OUT] action used for later cancel/publish. + */ + void (*mo_defer_free)(struct umem_instance *umm, + umem_off_t off, + struct pobj_action *act); + /** * Cancel the reservation. * @@ -494,6 +506,20 @@ umem_reserve(struct umem_instance *umm, struct pobj_action *act, size_t size) return UMOFF_NULL; } +static inline void +umem_defer_free(struct umem_instance *umm, umem_off_t off, + struct pobj_action *act) +{ + if (umm->umm_ops->mo_defer_free) + return umm->umm_ops->mo_defer_free(umm, off, act); + + /** Go ahead and free immediately. The purpose of this function + * is to allow reserve/publish pair to execute on commit + */ + umem_free(umm, off); +} + + static inline void umem_cancel(struct umem_instance *umm, struct pobj_action *actv, int actv_cnt) { diff --git a/src/include/daos_obj_class.h b/src/include/daos_obj_class.h index 04942c5e5f6..31032dad040 100644 --- a/src/include/daos_obj_class.h +++ b/src/include/daos_obj_class.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2019 Intel Corporation. + * (C) Copyright 2015-2020 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7b2cadb1493..b1eac285bda 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -87,7 +87,8 @@ struct dtx_handle { /** The flags, see dtx_entry_flags. */ uint32_t dth_flags; /** The count of reserved items in the dth_rsrvds array. */ - uint32_t dth_rsrvd_cnt; + uint16_t dth_rsrvd_cnt; + uint16_t dth_deferred_cnt; /** The total sub modifications count. */ uint16_t dth_modification_cnt; /** Modification sequence in the distributed transaction. */ @@ -105,6 +106,7 @@ struct dtx_handle { struct dtx_rsrvd_uint dth_rsrvd_inline; struct dtx_rsrvd_uint *dth_rsrvds; + void **dth_deferred; }; /* Each sub transaction handle to manage each sub thandle */ diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index a87d4badd8d..f2efa4a074d 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -37,6 +37,23 @@ #include #include +/** Initialize the vos reserve/cancel related fields in dtx handle + * + * \param dth [IN] The dtx handle + * + * \return 0 on success + * -DER_NOMEM on failure + */ +int +vos_dtx_rsrvd_init(struct dtx_handle *dth); + +/** Finalize the vos reserve/cancel related fields in dtx handle + * + * \param dth [IN] The dtx handle + */ +void +vos_dtx_rsrvd_fini(struct dtx_handle *dth); + /** * Check the specified DTX's status, and related epoch, pool map version * information if required. diff --git a/src/tests/suite/daos_test.c b/src/tests/suite/daos_test.c index 771c1a7b4dc..41cd435ded8 100644 --- a/src/tests/suite/daos_test.c +++ b/src/tests/suite/daos_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2019 Intel Corporation. + * (C) Copyright 2016-2020 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/vos/tests/vts_common.h b/src/vos/tests/vts_common.h index 59bf33e3e5a..82646f97019 100644 --- a/src/vos/tests/vts_common.h +++ b/src/vos/tests/vts_common.h @@ -145,8 +145,9 @@ vts_dtx_begin_ex(const daos_unit_oid_t *oid, daos_handle_t coh, dth = *dthp; dth->dth_modification_cnt = nmods; - D_ALLOC_ARRAY(dth->dth_rsrvds, nmods); - assert_ptr_not_equal(dth->dth_rsrvds, NULL); + + /** first call in vts_dtx_begin will have set this to inline */ + assert_int_equal(vos_dtx_rsrvd_init(dth), 0); } void diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index bc187b27be6..e3845569671 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -86,13 +86,13 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, dth->dth_modification_cnt = 1; dth->dth_op_seq = 1; - dth->dth_rsrvd_cnt = 0; dth->dth_oid_cnt = 0; dth->dth_oid_cap = 0; dth->dth_oid_array = NULL; dth->dth_dkey_hash = dkey_hash; - dth->dth_rsrvds = &dth->dth_rsrvd_inline; + + vos_dtx_rsrvd_init(dth); *dthp = dth; } @@ -100,8 +100,7 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, void vts_dtx_end(struct dtx_handle *dth) { - if (dth->dth_modification_cnt > 1) - D_FREE(dth->dth_rsrvds); + vos_dtx_rsrvd_fini(dth); D_FREE(dth->dth_dte.dte_mbs); D_FREE_PTR(dth); } diff --git a/src/vos/tests/vts_mvcc.c b/src/vos/tests/vts_mvcc.c index f206b8ab60d..dc35ad2131b 100644 --- a/src/vos/tests/vts_mvcc.c +++ b/src/vos/tests/vts_mvcc.c @@ -33,6 +33,17 @@ #include "vts_io.h" #include "vts_array.h" +struct tx_helper { + /** Current transaction handle */ + struct dtx_handle *th_dth; + /** Number of total ops in current tx */ + uint32_t th_nr_ops; + /** Number of write ops in current tx */ + uint32_t th_nr_mods; + /** Current op number */ + uint32_t th_op_seq; +}; + struct mvcc_arg { int i; /* used to generate different oids, etc. */ daos_epoch_t epoch; /* used to generate different epochs */ @@ -73,7 +84,7 @@ enum write_type { W_NIL /* not applicable */ }; -typedef int (*op_func_t)(struct io_test_args *arg, struct dtx_id *dti, +typedef int (*op_func_t)(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch); struct op { @@ -183,25 +194,75 @@ set_value(int i, char *path, d_iov_t *value) value->iov_len = strlen(value->iov_buf) + 1; } +static struct dtx_handle * +start_tx(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, + struct tx_helper *txh) +{ + struct dtx_handle *dth; + + if (txh == NULL) + return NULL; + + dth = txh->th_dth; + + if (dth == NULL) { + vts_dtx_begin_ex(&oid, coh, epoch, 0, txh->th_nr_mods, + &dth); + txh->th_dth = dth; + } + + return dth; +} + +static void +stop_tx(daos_handle_t coh, struct tx_helper *txh, bool success, bool write) +{ + struct dtx_handle *dth; + struct dtx_id xid; + int err; + + if (txh == NULL) + return; + + dth = txh->th_dth; + + if (write) + dth->dth_op_seq++; + + if (txh->th_nr_ops == txh->th_op_seq) { + xid = dth->dth_xid; + vts_dtx_end(dth); + if (txh->th_nr_mods != 0) { + if (success) { + err = vos_dtx_commit(coh, &xid, 1, NULL); + assert_int_equal(err, 1); + } + } + } + + txh->th_op_seq++; +} + static int -tx_fetch(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t oid, +tx_fetch(daos_handle_t coh, struct tx_helper *txh, daos_unit_oid_t oid, daos_epoch_t epoch, uint64_t flags, daos_key_t *dkey, unsigned int iod_nr, daos_iod_t *iod, d_sg_list_t *sgl) { struct dtx_handle *dth; int rc; - vts_dtx_begin(&oid, coh, epoch, 0, &dth); - if (dti != NULL) - dth->dth_xid = *dti; + dth = start_tx(coh, oid, epoch, txh); + rc = vos_obj_fetch_ex(coh, oid, epoch, flags, dkey, iod_nr, iod, sgl, dth); - vts_dtx_end(dth); + + stop_tx(coh, txh, rc == 0, false); + return rc; } static int -fetch_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, +fetch_with_flags(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch, uint64_t flags) { struct mvcc_arg *mvcc_arg = arg->custom; @@ -229,47 +290,38 @@ fetch_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, sgl.sg_nr = 1; sgl.sg_iovs = &value; - return tx_fetch(arg->ctx.tc_co_hdl, dti, oid, epoch, flags, &dkey, + return tx_fetch(arg->ctx.tc_co_hdl, txh, oid, epoch, flags, &dkey, 1 /* iod_nr */, &iod, &sgl); } static int -fetch_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +fetch_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return fetch_with_flags(arg, dti, path, epoch, 0 /* flags */); + return fetch_with_flags(arg, txh, path, epoch, 0 /* flags */); } static int -tx_update(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t oid, +tx_update(daos_handle_t coh, struct tx_helper *txh, daos_unit_oid_t oid, daos_epoch_t epoch, uint64_t flags, daos_key_t *dkey, unsigned int iod_nr, daos_iod_t *iod, d_sg_list_t *sgl) { struct dtx_handle *dth; - struct dtx_id xid; int rc; - vts_dtx_begin(&oid, coh, epoch, 0, &dth); - if (dti != NULL) - dth->dth_xid = *dti; + dth = start_tx(coh, oid, epoch, txh); + rc = vos_obj_update_ex(coh, oid, epoch, 0 /* pm_ver */, flags, dkey, iod_nr, iod, NULL /* iods_csums */, sgl, dth); - xid = dth->dth_xid; - vts_dtx_end(dth); - - if (rc == 0) { - int err; - err = vos_dtx_commit(coh, &xid, 1, NULL); - assert_int_equal(err, 1); - } + stop_tx(coh, txh, rc == 0, true); return rc; } static int -update_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_with_flags(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch, uint64_t flags) { struct mvcc_arg *mvcc_arg = arg->custom; @@ -298,78 +350,69 @@ update_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, sgl.sg_nr = 1; sgl.sg_iovs = &value; - return tx_update(arg->ctx.tc_co_hdl, dti, oid, epoch, flags, &dkey, + return tx_update(arg->ctx.tc_co_hdl, txh, oid, epoch, flags, &dkey, 1 /* iod_nr */, &iod, &sgl); } static int -update_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return update_with_flags(arg, dti, path, epoch, 0 /* flags */); + return update_with_flags(arg, txh, path, epoch, 0 /* flags */); } static int -update_de_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_de_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return update_with_flags(arg, dti, path, epoch, + return update_with_flags(arg, txh, path, epoch, VOS_OF_COND_DKEY_INSERT); } static int -update_dne_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_dne_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return update_with_flags(arg, dti, path, epoch, + return update_with_flags(arg, txh, path, epoch, VOS_OF_COND_DKEY_UPDATE); } static int -update_ae_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_ae_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return update_with_flags(arg, dti, path, epoch, + return update_with_flags(arg, txh, path, epoch, VOS_OF_COND_AKEY_INSERT); } static int -update_ane_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +update_ane_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return update_with_flags(arg, dti, path, epoch, + return update_with_flags(arg, txh, path, epoch, VOS_OF_COND_AKEY_UPDATE); } static int -tx_punch(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t oid, +tx_punch(daos_handle_t coh, struct tx_helper *txh, daos_unit_oid_t oid, daos_epoch_t epoch, uint64_t flags, daos_key_t *dkey, unsigned int akey_nr, daos_key_t *akeys) { struct dtx_handle *dth; - struct dtx_id xid; int rc; - vts_dtx_begin(&oid, coh, epoch, 0, &dth); - if (dti != NULL) - dth->dth_xid = *dti; + dth = start_tx(coh, oid, epoch, txh); + rc = vos_obj_punch(coh, oid, epoch, 0 /* pm_ver */, flags, dkey, akey_nr, akeys, dth); - xid = dth->dth_xid; - vts_dtx_end(dth); - - if (rc == 0) { - int err; - err = vos_dtx_commit(coh, &xid, 1, NULL); - assert_int_equal(err, 1); - } + stop_tx(coh, txh, rc == 0, true); return rc; } static int -puncho_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncho_with_flags(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch, uint64_t flags) { struct mvcc_arg *mvcc_arg = arg->custom; @@ -377,26 +420,26 @@ puncho_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, set_oid(mvcc_arg->i, path, &oid); - return tx_punch(arg->ctx.tc_co_hdl, dti, oid, epoch, flags, + return tx_punch(arg->ctx.tc_co_hdl, txh, oid, epoch, flags, NULL /* dkey */, 0 /* akey_nr */, NULL /* akeys */); } static int -puncho_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncho_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return puncho_with_flags(arg, dti, path, epoch, 0 /* flags */); + return puncho_with_flags(arg, txh, path, epoch, 0 /* flags */); } static int -puncho_one_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncho_one_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return puncho_with_flags(arg, dti, path, epoch, VOS_OF_COND_PUNCH); + return puncho_with_flags(arg, txh, path, epoch, VOS_OF_COND_PUNCH); } static int -punchd_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, +punchd_with_flags(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch, uint64_t flags) { struct mvcc_arg *mvcc_arg = arg->custom; @@ -407,26 +450,26 @@ punchd_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, set_oid(mvcc_arg->i, path, &oid); set_dkey(mvcc_arg->i, path, &dkey); - return tx_punch(arg->ctx.tc_co_hdl, dti, oid, epoch, flags, &dkey, + return tx_punch(arg->ctx.tc_co_hdl, txh, oid, epoch, flags, &dkey, 0 /* akey_nr */, NULL /* akeys */); } static int -punchd_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +punchd_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return punchd_with_flags(arg, dti, path, epoch, 0 /* flags */); + return punchd_with_flags(arg, txh, path, epoch, 0 /* flags */); } static int -punchd_dne_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +punchd_dne_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return punchd_with_flags(arg, dti, path, epoch, VOS_OF_COND_PUNCH); + return punchd_with_flags(arg, txh, path, epoch, VOS_OF_COND_PUNCH); } static int -puncha_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncha_with_flags(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch, uint64_t flags) { struct mvcc_arg *mvcc_arg = arg->custom; @@ -440,22 +483,22 @@ puncha_with_flags(struct io_test_args *arg, struct dtx_id *dti, char *path, set_dkey(mvcc_arg->i, path, &dkey); set_akey(mvcc_arg->i, path, &akey); - return tx_punch(arg->ctx.tc_co_hdl, dti, oid, epoch, flags, &dkey, + return tx_punch(arg->ctx.tc_co_hdl, txh, oid, epoch, flags, &dkey, 1 /* akey_nr */, &akey); } static int -puncha_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncha_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return puncha_with_flags(arg, dti, path, epoch, 0 /* flags */); + return puncha_with_flags(arg, txh, path, epoch, 0 /* flags */); } static int -puncha_ane_f(struct io_test_args *arg, struct dtx_id *dti, char *path, +puncha_ane_f(struct io_test_args *arg, struct tx_helper *txh, char *path, daos_epoch_t epoch) { - return puncha_with_flags(arg, dti, path, epoch, VOS_OF_COND_PUNCH); + return puncha_with_flags(arg, txh, path, epoch, VOS_OF_COND_PUNCH); } static struct op operations[] = { @@ -527,7 +570,12 @@ static struct conflicting_rw_excluded_case conflicting_rw_excluded_cases[] = { {false, "puncha_ane", "coda", "puncho_one", "co", 0, false}, {false, "puncha_ane", "coda", "puncho_one", "co", 1, false}, {false, "puncha_ane", "coda", "punchd_dne", "cod", 0, false}, - {false, "puncha_ane", "coda", "punchd_dne", "cod", 1, false} + {false, "puncha_ane", "coda", "punchd_dne", "cod", 1, false}, + {false, "puncha_ane", "coda", "puncho_one", "co", 0, true}, + {false, "punchd_dne", "cod", "puncho_one", "co", 0, true}, + {false, "puncha_ane", "coda", "update_de", "coda", 0, true}, + {false, "puncha_ane", "coda", "update_dne", "coda", 0, true}, + {false, "puncha_ane", "coda", "punchd_dne", "cod", 0, true}, }; static int64_t @@ -569,32 +617,59 @@ conflicting_rw_exec_one(struct io_test_args *arg, int i, int j, bool empty, struct op *r, char *rp, daos_epoch_t re, struct op *w, char *wp, daos_epoch_t we, bool same_tx) { - struct mvcc_arg *mvcc_arg = arg->custom; - struct dtx_id rtx; - struct dtx_id wtx; - int expected_rrc = 0; - int expected_wrc = 0; - int nfailed = 0; - int rc; + struct mvcc_arg *mvcc_arg = arg->custom; + struct tx_helper *rtx; + struct tx_helper *wtx; + struct tx_helper txh1 = {0}; + struct tx_helper txh2 = {0}; + int expected_rrc = 0; + int expected_wrc = 0; + int nfailed = 0; + int rc; -#if 1 /* FIXME: Many "same TX" cases fail currently. */ - if (same_tx) - goto out; -#endif if (is_excluded(empty, r, rp, re, w, wp, we, same_tx)) goto out; + /* + * Figure out the expected read result, perform read, and verify the + * result. + */ + if (r->o_rtype == R_E && !empty) + expected_rrc = -DER_EXIST; + else if (r->o_rtype == R_NE && empty) + expected_rrc = -DER_NONEXIST; + + if (same_tx && expected_rrc != 0) { + /** Not a valid use case as conditional updates are split in the + * context of distributed transactions. The conditional fetch + * would mean either the update doesn't execute or should abort + * the transaction if it returns -DER_EXIST + */ + goto out; + } + print_message("CASE %d.%d: %s, %s(%s, "DF_U64"), %s(%s, "DF_U64"), " "%s TX [%d]\n", i, j, empty ? "empty" : "nonemtpy", r->o_name, rp, re, w->o_name, wp, we, same_tx ? "same" : "diff", mvcc_arg->i); - /* Generate the TX IDs. */ - daos_dti_gen_unique(&rtx); - if (same_tx) - wtx = rtx; - else - daos_dti_gen_unique(&wtx); + if (same_tx) { + rtx = wtx = &txh1; + txh1.th_nr_ops = 2; + txh1.th_op_seq = 1; + if (is_rw(r)) + txh1.th_nr_mods = 2; + else + txh1.th_nr_mods = 1; + } else { + rtx = &txh1; + wtx = &txh2; + txh1.th_nr_ops = txh2.th_nr_ops = 1; + txh1.th_op_seq = txh2.th_op_seq = 1; + txh2.th_nr_mods = 1; + if (is_rw(r)) + txh1.th_nr_mods = 1; + } /* If requested, prepare the data that will be read. */ if (!empty) { @@ -603,7 +678,7 @@ conflicting_rw_exec_one(struct io_test_args *arg, int i, int j, bool empty, memcpy(pp, rp, strlen(rp)); print_message(" update(%s, "DF_U64") before %s(%s, " DF_U64"): ", pp, re - 1, r->o_name, rp, re); - rc = update_f(arg, NULL /* dti */, pp, re - 1); + rc = update_f(arg, NULL /* txh */, pp, re - 1); print_message("%d\n", rc); if (rc != 0) { nfailed++; @@ -611,17 +686,9 @@ conflicting_rw_exec_one(struct io_test_args *arg, int i, int j, bool empty, } } - /* - * Figure out the expected read result, perform read, and verify the - * result. - */ - if (r->o_rtype == R_E && !empty) - expected_rrc = -DER_EXIST; - else if (r->o_rtype == R_NE && empty) - expected_rrc = -DER_NONEXIST; print_message(" %s(%s, "DF_U64") (expect %d): ", r->o_name, rp, re, expected_rrc); - rc = r->o_func(arg, &rtx, rp, re); + rc = r->o_func(arg, rtx, rp, re); print_message("%d\n", rc); if (rc != expected_rrc) { nfailed++; @@ -655,7 +722,7 @@ conflicting_rw_exec_one(struct io_test_args *arg, int i, int j, bool empty, } print_message(" %s(%s, "DF_U64") (expect %d): ", w->o_name, wp, we, expected_wrc); - rc = w->o_func(arg, &wtx, wp, we); + rc = w->o_func(arg, wtx, wp, we); print_message("%d\n", rc); if (rc != expected_wrc) nfailed++; diff --git a/src/vos/tests/vts_ts.c b/src/vos/tests/vts_ts.c index 9c69a9df1b5..170c40f796a 100644 --- a/src/vos/tests/vts_ts.c +++ b/src/vos/tests/vts_ts.c @@ -734,9 +734,10 @@ ts_test_init(void **state) struct vos_ts_table *ts_table; struct ts_test_arg *ts_arg; int rc; - uuid_t uuid; + struct dtx_id xid = {0}; - uuid_clear(uuid); + uuid_clear(xid.dti_uuid); + xid.dti_hlc = 1; D_ALLOC_PTR(ts_arg); if (ts_arg == NULL) @@ -751,7 +752,7 @@ ts_test_init(void **state) for (i = 0; i < VOS_TS_TYPE_COUNT; i++) ts_arg->ta_counts[i] = ts_table->tt_type_info[i].ti_count; - rc = vos_ts_set_allocate(&ts_arg->ta_ts_set, 0, 1, &uuid); + rc = vos_ts_set_allocate(&ts_arg->ta_ts_set, 0, 0, 1, &xid); if (rc != 0) { D_FREE(ts_arg); return rc; diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 5cf7f84fc48..e4cd3cf27b1 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -124,18 +124,18 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob) } static int -vos_tx_publish(struct dtx_handle *dth, int err) +vos_tx_publish(struct dtx_handle *dth, bool publish) { struct vos_container *cont = vos_hdl2cont(dth->dth_coh); struct dtx_rsrvd_uint *dru; + struct vos_rsrvd_scm *scm; int rc; int i; -again: - for (i = 0, rc = 0; - i < dth->dth_rsrvd_cnt && (rc == 0 || err != 0); i++) { + for (i = 0; i < dth->dth_rsrvd_cnt; i++) { dru = &dth->dth_rsrvds[i]; - rc = vos_publish_scm(cont, dru->dru_scm, err == 0); + rc = vos_publish_scm(cont, dru->dru_scm, publish); + D_FREE(dru->dru_scm); /* FIXME: Currently, vos_publish_blocks() will release * reserved information in 'dru_nvme_list' from @@ -148,20 +148,26 @@ vos_tx_publish(struct dtx_handle *dth, int err) * * It is not fatal, will be handled later. */ - if (rc == 0 || err != 0) - rc = vos_publish_blocks(cont, &dru->dru_nvme, - err == 0, VOS_IOS_GENERIC); - if (err != 0) - D_FREE(dru->dru_scm); + if (rc && publish) + return rc; + + /** Function checks if list is empty */ + rc = vos_publish_blocks(cont, &dru->dru_nvme, + publish, VOS_IOS_GENERIC); + if (rc && publish) + return rc; } - /* Some publish failed, cancel all. */ - if (err == 0 && rc != 0) { - err = rc; - goto again; + for (i = 0; i < dth->dth_deferred_cnt; i++) { + scm = dth->dth_deferred[i]; + rc = vos_publish_scm(cont, scm, publish); + D_FREE(dth->dth_deferred[i]); + + if (rc && publish) + return rc; } - return err; + return 0; } int @@ -183,13 +189,36 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm) } int -vos_tx_end(struct dtx_handle *dth, struct umem_instance *umm, int err) +vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, + struct vos_rsrvd_scm **rsrvd_scmp, d_list_t *nvme_exts, bool started, + int err) { - if (!dtx_is_valid_handle(dth)) - return umem_tx_end(umm, err); + struct dtx_handle *dth = dth_in; + struct dtx_rsrvd_uint *dru; + struct dtx_handle tmp = {0}; + int rc = err; + + if (!dtx_is_valid_handle(dth)) { + /** Created a dummy dth handle for publishing extents */ + dth = &tmp; + tmp.dth_modification_cnt = dth->dth_op_seq = 1; + tmp.dth_local_tx_started = started ? 1 : 0; + tmp.dth_rsrvds = &dth->dth_rsrvd_inline; + tmp.dth_coh = vos_cont2hdl(cont); + } + + if (rsrvd_scmp != NULL) { + D_ASSERT(nvme_exts != NULL); + dru = &dth->dth_rsrvds[dth->dth_rsrvd_cnt++]; + dru->dru_scm = *rsrvd_scmp; + *rsrvd_scmp = NULL; + + D_INIT_LIST_HEAD(&dru->dru_nvme); + d_list_splice_init(nvme_exts, &dru->dru_nvme); + } if (!dth->dth_local_tx_started) - return err; + goto cancel; /* Not the last modification. */ if (err == 0 && dth->dth_modification_cnt > dth->dth_op_seq) @@ -197,32 +226,26 @@ vos_tx_end(struct dtx_handle *dth, struct umem_instance *umm, int err) dth->dth_local_tx_started = 0; - if (err == 0) + if (dtx_is_valid_handle(dth_in) && err == 0) err = vos_dtx_prepared(dth); - err = vos_tx_publish(dth, err); - if (err != 0) { - vos_dtx_cleanup(dth); - - return umem_tx_abort(umm, err); - } - - err = umem_tx_commit(umm); - if (err == 0) { - int i; + if (err == 0) + rc = vos_tx_publish(dth, true); - for (i = 0; i < dth->dth_modification_cnt; i++) { - struct dtx_rsrvd_uint *dru = &dth->dth_rsrvds[i]; + rc = umem_tx_end(vos_cont2umm(cont), rc); - D_FREE(dru->dru_scm); - } - } else { - /* Aborted case. */ - vos_tx_publish(dth, -DER_CANCELED); - vos_dtx_cleanup(dth); +cancel: + if (rc != 0) { + /* The transaction aborted or failed to commit. */ + vos_tx_publish(dth, false); + if (dtx_is_valid_handle(dth_in)) + vos_dtx_cleanup_internal(dth); } - return err; + if (err != 0) + return err; + + return rc; } /** diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 79568a5627f..a8d843b5dd5 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -2188,7 +2188,7 @@ vos_dtx_cmt_reindex(daos_handle_t coh, void *hint) } void -vos_dtx_cleanup(struct dtx_handle *dth) +vos_dtx_cleanup_internal(struct dtx_handle *dth) { struct vos_container *cont; struct vos_dtx_act_ent *dae = NULL; @@ -2213,6 +2213,52 @@ vos_dtx_cleanup(struct dtx_handle *dth) dtx_evict_lid(cont, dae); } } +} + +void +vos_dtx_cleanup(struct dtx_handle *dth) +{ + struct vos_container *cont; + + if (!dtx_is_valid_handle(dth) || !dth->dth_active) + return; + + cont = vos_hdl2cont(dth->dth_coh); + /** This will abort the transaction and callback to + * vos_dtx_cleanup_internal + */ + vos_tx_end(cont, dth, NULL, NULL, true /* don't care */, -DER_CANCELED); +} + +/** Allocate space for saving the vos reservations and deferred actions */ +int +vos_dtx_rsrvd_init(struct dtx_handle *dth) +{ + dth->dth_rsrvd_cnt = 0; + dth->dth_deferred_cnt = 0; + + if (dth->dth_modification_cnt <= 1) { + dth->dth_rsrvds = &dth->dth_rsrvd_inline; + return 0; + } + + D_ALLOC_ARRAY(dth->dth_rsrvds, dth->dth_modification_cnt); + if (dth->dth_rsrvds == NULL) + return -DER_NOMEM; - vos_tx_end(dth, vos_cont2umm(cont), -DER_CANCELED); + D_ALLOC_ARRAY(dth->dth_deferred, dth->dth_modification_cnt); + if (dth->dth_deferred == NULL) { + D_FREE(dth->dth_rsrvds); + return -DER_NOMEM; + } + + return 0; +} + +void +vos_dtx_rsrvd_fini(struct dtx_handle *dth) +{ + D_FREE(dth->dth_deferred); + if (dth->dth_rsrvds != &dth->dth_rsrvd_inline) + D_FREE(dth->dth_rsrvds); } diff --git a/src/vos/vos_ilog.c b/src/vos/vos_ilog.c index 6195a544658..7d865f1f413 100644 --- a/src/vos/vos_ilog.c +++ b/src/vos/vos_ilog.c @@ -546,42 +546,19 @@ vos_ilog_init(void) return 0; } -bool -vos_ilog_ts_lookup(struct vos_ts_set *ts_set, struct ilog_df *ilog) -{ - struct vos_ts_entry *entry; - uint32_t *idx; - - if (ts_set == NULL) - return true; - - idx = ilog_ts_idx_get(ilog); - - return vos_ts_lookup(ts_set, idx, false, &entry); -} - int -vos_ilog_ts_cache(struct vos_ts_set *ts_set, struct ilog_df *ilog, - void *record, daos_size_t rec_size) +vos_ilog_ts_add(struct vos_ts_set *ts_set, struct ilog_df *ilog, + const void *record, daos_size_t rec_size) { - struct vos_ts_entry *entry; - uint32_t *idx; - uint64_t hash; + uint32_t *idx = NULL; if (ts_set == NULL) return 0; - hash = vos_hash_get(record, rec_size); - if (ilog) { + if (ilog != NULL) idx = ilog_ts_idx_get(ilog); - entry = vos_ts_alloc(ts_set, idx, hash); - if (entry == NULL) - return -DER_NO_PERM; - } else { - vos_ts_get_negative(ts_set, hash, false); - } - return 0; + return vos_ts_set_add(ts_set, idx, record, rec_size); } void diff --git a/src/vos/vos_ilog.h b/src/vos/vos_ilog.h index af103e075a2..120f9ae0f76 100644 --- a/src/vos/vos_ilog.h +++ b/src/vos/vos_ilog.h @@ -298,26 +298,14 @@ vos_ilog_aggregate(daos_handle_t coh, struct ilog_df *ilog, * * \param ts_set[in] The timestamp set * \param ilog[in] The incarnation log - * - * \return true if found or ts_set is NULL - */ -bool -vos_ilog_ts_lookup(struct vos_ts_set *ts_set, struct ilog_df *ilog); - -/** Allocate timestamps for the entry and add them to the set. If ilog is - * NULL, it pulls in the negative entry. The hash is calculated using - * vos_hash_get. - * - * \param ts_set[in] The timestamp set - * \param ilog[in] The incarnation log * \param record[in] The record to hash - * \param rec_size[in] The size of the record to hash + * \param rec_size[in] The size of the record * - * \return 0 on success or an error + * \return true if found or ts_set is NULL */ int -vos_ilog_ts_cache(struct vos_ts_set *ts_set, struct ilog_df *ilog, - void *record, daos_size_t rec_size); +vos_ilog_ts_add(struct vos_ts_set *ts_set, struct ilog_df *ilog, + const void *record, daos_size_t rec_size); /** Mark the last timestamp entry corresponding to the ilog as newly created * \param ts_set[in] The timestamp set diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 534fc60932d..defa529619a 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -112,8 +112,6 @@ enum { DTX_LID_RESERVED, }; -/** hash seed for murmur hash */ -#define VOS_BTR_MUR_SEED 0xC0FFEE /* * When aggregate merge window reaches this size threshold, it will stop * growing and trigger window flush immediately. @@ -398,6 +396,10 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df); int vos_dtx_table_register(void); +/** Cleanup the dtx handle when aborting a transaction. */ +void +vos_dtx_cleanup_internal(struct dtx_handle *dth); + /** * Check whether the record (to be accessible) is available to outside or not. * @@ -934,11 +936,34 @@ void vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh); +/* Reserve SCM through umem_reserve() for a PMDK transaction */ +struct vos_rsrvd_scm { + unsigned int rs_actv_cnt; + unsigned int rs_actv_at; + struct pobj_action rs_actv[0]; +}; + int vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm); +/** Finish the transaction and publish or cancel the reservations or + * return if err == 0 and it's a multi-modification transaction that + * isn't complete. + * + * \param[in] cont the VOS container + * \param[in] dth_in The dtx handle, if applicable + * \param[in] rsrvd_scmp Pointer to reserved scm, will be consumed + * \param[in] nvme_exts List of resreved nvme extents + * \param[in] started Only applies when dth_in is invalid, + * indicates if vos_tx_begin was successful + * \param[in] err the error code + * + * \return err if non-zero, otherwise 0 or appropriate error + */ int -vos_tx_end(struct dtx_handle *dth, struct umem_instance *umm, int err); +vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, + struct vos_rsrvd_scm **rsrvd_scmp, d_list_t *nvme_exts, bool started, + int err); /* vos_obj.c */ int @@ -978,13 +1003,6 @@ vos_dedup_fini(struct vos_pool *pool); void vos_dedup_invalidate(struct vos_pool *pool); -/* Reserve SCM through umem_reserve() for a PMDK transaction */ -struct vos_rsrvd_scm { - unsigned int rs_actv_cnt; - unsigned int rs_actv_at; - struct pobj_action rs_actv[0]; -}; - umem_off_t vos_reserve_scm(struct vos_container *cont, struct vos_rsrvd_scm *rsrvd_scm, daos_size_t size); @@ -1040,15 +1058,6 @@ void gc_reserve_space(daos_size_t *rsrvd); -static inline uint64_t -vos_hash_get(void *buf, uint64_t len) -{ - if (buf == NULL) - return vos_kh_get(); - - return d_hash_murmur64(buf, len, VOS_BTR_MUR_SEED); -} - /** * Aggregate the creation/punch records in the current entry of the object * iterator @@ -1139,4 +1148,5 @@ vos_epc_punched(daos_epoch_t epc, uint16_t minor_epc, return false; } + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index ba4fa09b13c..fe8d5eff408 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -70,8 +70,6 @@ struct vos_io_context { daos_size_t ic_space_held[DAOS_MEDIA_MAX]; /** number DAOS IO descriptors */ unsigned int ic_iod_nr; - /** IO had a read conflict */ - bool ic_read_conflict; /** deduplication threshold size */ uint32_t ic_dedup_th; /** dedup entries to be inserted after transaction done */ @@ -389,11 +387,12 @@ vos_ioc_reserve_fini(struct vos_io_context *ioc) } static int -vos_ioc_reserve_init(struct vos_io_context *ioc) +vos_ioc_reserve_init(struct vos_io_context *ioc, struct dtx_handle *dth) { - size_t size; - int total_acts = 0; - int i; + struct vos_rsrvd_scm *scm; + size_t size; + int total_acts = 0; + int i; if (!ioc->ic_update) return 0; @@ -418,6 +417,20 @@ vos_ioc_reserve_init(struct vos_io_context *ioc) return -DER_NOMEM; ioc->ic_rsrvd_scm->rs_actv_cnt = total_acts; + + if (!dtx_is_valid_handle(dth) || dth->dth_deferred == NULL) + return 0; + + /** Reserve enough space for any deferred actions */ + D_ALLOC(scm, size); + if (scm == NULL) { + D_FREE(ioc->ic_rsrvd_scm); + return -DER_NOMEM; + } + + scm->rs_actv_cnt = total_acts; + dth->dth_deferred[dth->dth_deferred_cnt++] = scm; + return 0; } @@ -449,10 +462,11 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only, bool dedup, uint32_t dedup_th, struct dtx_handle *dth, struct vos_io_context **ioc_pp) { - struct vos_container *cont; - struct vos_io_context *ioc = NULL; - struct bio_io_context *bioc; - int i, rc; + struct vos_container *cont; + struct vos_io_context *ioc = NULL; + struct bio_io_context *bioc; + uint64_t cflags = 0; + int i, rc; if (iod_nr == 0) { D_ERROR("Invalid iod_nr (0).\n"); @@ -481,7 +495,6 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only, ((fetch_flags & VOS_FETCH_CHECK_EXISTENCE) != 0); ioc->ic_remove = ((cond_flags & VOS_OF_REMOVE) != 0); - ioc->ic_read_conflict = false; ioc->ic_umoffs_cnt = ioc->ic_umoffs_at = 0; ioc->iod_csums = iod_csums; vos_ilog_fetch_init(&ioc->ic_dkey_info); @@ -490,13 +503,27 @@ vos_ioc_create(daos_handle_t coh, daos_unit_oid_t oid, bool read_only, ioc->ic_shadows = shadows; D_INIT_LIST_HEAD(&ioc->ic_dedup_entries); - rc = vos_ioc_reserve_init(ioc); + rc = vos_ioc_reserve_init(ioc, dth); if (rc != 0) goto error; - rc = vos_ts_set_allocate(&ioc->ic_ts_set, cond_flags, iod_nr, + if (dtx_is_valid_handle(dth)) { + if (read_only) { + cflags = VOS_TS_READ_AKEY; + if (cond_flags & VOS_OF_COND_DKEY_FETCH) + cflags |= VOS_TS_READ_DKEY; + } else { + cflags = VOS_TS_WRITE_AKEY; + if (cond_flags & VOS_COND_AKEY_UPDATE_MASK) + cflags |= VOS_TS_READ_AKEY; + if (cond_flags & VOS_COND_DKEY_UPDATE_MASK) + cflags |= VOS_TS_READ_DKEY; + } + } + + rc = vos_ts_set_allocate(&ioc->ic_ts_set, cond_flags, cflags, iod_nr, dtx_is_valid_handle(dth) ? - &dth->dth_xid.dti_uuid : NULL); + &dth->dth_xid : NULL); if (rc != 0) goto error; @@ -1174,58 +1201,6 @@ vos_fetch_end(daos_handle_t ioh, int err) return err; } -static void -update_ts_on_fetch(struct vos_io_context *ioc, int err) -{ - struct vos_ts_set *ts_set = ioc->ic_ts_set; - struct vos_ts_entry *entry; - struct vos_ts_entry *prev; - int akey_idx; - - if (ts_set == NULL) - return; - - /** Aborted for another reason, no timestamp updates */ - if (err != 0 && err != -DER_NONEXIST) - return; - - /** Fetch is always a read of the value so always update the - * both akey timestamps regardless - */ - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_CONT, 0); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_OBJ, 0); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - prev = entry; - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_DKEY, 0); - - if (entry == NULL) - goto update_prev; - if (ts_set->ts_flags & VOS_OF_COND_DKEY_FETCH) - vos_ts_rl_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - - if (entry == prev) - return; - - prev = entry; - - for (akey_idx = 0; akey_idx < ioc->ic_iod_nr; akey_idx++) { - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_AKEY, - akey_idx); - if (entry == NULL) - goto update_prev; - - vos_ts_rl_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - } - - return; - -update_prev: - vos_ts_rl_update(prev, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); -} - int vos_fetch_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, uint64_t cond_flags, daos_key_t *dkey, unsigned int iod_nr, @@ -1234,7 +1209,6 @@ vos_fetch_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, struct dtx_handle *dth) { struct vos_io_context *ioc; - struct vos_ts_entry *entry; int i, rc; D_DEBUG(DB_TRACE, "Fetch "DF_UOID", desc_nr %d, epoch "DF_X64"\n", @@ -1248,12 +1222,8 @@ vos_fetch_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, vos_dth_set(dth); - if (!vos_ts_lookup(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, false, - &entry)) { - /** Re-cache the container timestamps */ - entry = vos_ts_alloc(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, - 0); - } + rc = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); + D_ASSERT(rc == 0); rc = vos_obj_hold(vos_obj_cache_current(), ioc->ic_cont, oid, &ioc->ic_epr, true, DAOS_INTENT_DEFAULT, true, @@ -1288,7 +1258,8 @@ vos_fetch_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, out: vos_dth_set(NULL); - update_ts_on_fetch(ioc, rc); + if (rc == -DER_NONEXIST || rc == 0) + vos_ts_set_update(ioc->ic_ts_set, ioc->ic_epr.epr_hi); if (rc != 0) { daos_recx_ep_list_free(ioc->ic_recx_lists, ioc->ic_iod_nr); @@ -1452,9 +1423,6 @@ akey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_handle_t ak_toh, if (rc != 0) return rc; - if (vos_ts_check_rh_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi)) - ioc->ic_read_conflict = true; - if (ioc->ic_ts_set) { switch (ioc->ic_ts_set->ts_flags & VOS_COND_AKEY_UPDATE_MASK) { case VOS_OF_COND_AKEY_UPDATE: @@ -1547,9 +1515,6 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, } subtr_created = true; - if (vos_ts_check_rl_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi)) - ioc->ic_read_conflict = true; - if (ioc->ic_ts_set) { if (ioc->ic_ts_set->ts_flags & VOS_COND_UPDATE_OP_MASK) update_cond = VOS_ILOG_COND_UPDATE; @@ -1936,9 +1901,10 @@ update_cancel(struct vos_io_context *ioc) { /* Cancel SCM reservations or free persistent allocations */ - if (vos_cont2umm(ioc->ic_cont)->umm_ops->mo_reserve != NULL) { - vos_publish_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, false); - } else if (ioc->ic_umoffs_cnt != 0) { + if (vos_cont2umm(ioc->ic_cont)->umm_ops->mo_reserve != NULL) + return; + + if (ioc->ic_umoffs_cnt != 0) { struct umem_instance *umem = vos_ioc2umm(ioc); int i; @@ -1951,71 +1917,11 @@ update_cancel(struct vos_io_context *ioc) } } - /* Cancel NVMe reservations */ - vos_publish_blocks(ioc->ic_cont, &ioc->ic_blk_exts, false, - VOS_IOS_GENERIC); - /* Abort dedup entries */ vos_dedup_process(vos_cont2pool(ioc->ic_cont), &ioc->ic_dedup_entries, true /* abort */); } -static void -update_ts_on_update(struct vos_io_context *ioc, int err) -{ - struct vos_ts_set *ts_set = ioc->ic_ts_set; - struct vos_ts_entry *entry; - struct vos_ts_entry *centry; - int akey_idx; - - if (ts_set == NULL) - return; - - /** No conditional flags, so no timestamp updates */ - if ((ts_set->ts_flags & VOS_COND_UPDATE_MASK) == 0) - return; - - /** Aborted for another reason, no timestamp updates */ - if (err != 0 && err != -DER_NONEXIST && err != -DER_EXIST) - return; - - if (err == 0) { - /** the update succeeded so any negative entries used for - * checks should be changed to positive entries - */ - vos_ts_set_upgrade(ts_set); - } - - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_CONT, 0); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_OBJ, 0); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - centry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_DKEY, 0); - if (centry == NULL) { - vos_ts_rl_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - return; - } - entry = centry; - if (ts_set->ts_flags & VOS_COND_DKEY_UPDATE_MASK) - vos_ts_rl_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - vos_ts_rh_update(entry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - - if ((ts_set->ts_flags & VOS_COND_AKEY_UPDATE_MASK) == 0) - return; - - for (akey_idx = 0; akey_idx < ioc->ic_iod_nr; akey_idx++) { - centry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_AKEY, - akey_idx); - if (centry == NULL) { - vos_ts_rl_update(entry, ioc->ic_epr.epr_hi, - ts_set->ts_tx_id); - continue; - } - vos_ts_rl_update(centry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - vos_ts_rh_update(centry, ioc->ic_epr.epr_hi, ts_set->ts_tx_id); - } -} - int vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, struct dtx_handle *dth) @@ -2023,30 +1929,25 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, struct vos_dtx_act_ent **daes = NULL; struct vos_io_context *ioc = vos_ioh2ioc(ioh); struct umem_instance *umem; - struct vos_ts_entry *entry; - uint64_t time = 0; + uint64_t time = 0; + bool tx_started = false; VOS_TIME_START(time, VOS_UPDATE_END); D_ASSERT(ioc->ic_update); if (err != 0) - goto out; - - if (!vos_ts_lookup(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, false, - &entry)) { - /** Re-cache the container timestamps */ - entry = vos_ts_alloc(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, - 0); - } + goto abort; - if (vos_ts_check_rl_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi)) - ioc->ic_read_conflict = true; + err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); + D_ASSERT(err == 0); umem = vos_ioc2umm(ioc); err = vos_tx_begin(dth, umem); if (err != 0) - goto out; + goto abort; + + tx_started = true; vos_dth_set(dth); @@ -2069,10 +1970,6 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - /** Check object timestamp */ - if (vos_ts_check_rl_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi)) - ioc->ic_read_conflict = true; - /* Update tree index */ err = dkey_update(ioc, pm_ver, dkey, dtx_is_valid_handle(dth) ? dth->dth_op_seq : VOS_MINOR_EPC_MAX); @@ -2085,47 +1982,32 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, /** Now that we are past the existence checks, ensure there isn't a * read conflict */ - if (ioc->ic_read_conflict) { + if (vos_ts_set_check_conflict(ioc->ic_ts_set, ioc->ic_epr.epr_hi)) { err = -DER_TX_RESTART; goto abort; } - if (dtx_is_valid_handle(dth)) { - struct dtx_rsrvd_uint *dru; - - dru = &dth->dth_rsrvds[dth->dth_rsrvd_cnt++]; - dru->dru_scm = ioc->ic_rsrvd_scm; - ioc->ic_rsrvd_scm = NULL; - - D_INIT_LIST_HEAD(&dru->dru_nvme); - d_list_splice_init(&ioc->ic_blk_exts, &dru->dru_nvme); - } else { - /* Publish SCM reservations */ - err = vos_publish_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, true); - if (err == 0) - /* Publish NVMe reservations */ - err = vos_publish_blocks(ioc->ic_cont, - &ioc->ic_blk_exts, true, - VOS_IOS_GENERIC); - } - abort: - err = vos_tx_end(dth, umem, err); + err = vos_tx_end(ioc->ic_cont, dth, &ioc->ic_rsrvd_scm, + &ioc->ic_blk_exts, tx_started, err); -out: - if (err != 0) { - update_cancel(ioc); - } else { + if (err == 0) { if (daes != NULL) vos_dtx_post_handle(ioc->ic_cont, daes, dth->dth_dti_cos_count, false); vos_dedup_process(vos_cont2pool(ioc->ic_cont), &ioc->ic_dedup_entries, false); + } else { + update_cancel(ioc); } D_FREE(daes); - update_ts_on_update(ioc, err); + if (err == 0) + vos_ts_set_upgrade(ioc->ic_ts_set); + + if (err == -DER_NONEXIST || err == -DER_EXIST || err == 0) + vos_ts_set_update(ioc->ic_ts_set, ioc->ic_epr.epr_hi); VOS_TIME_END(time, VOS_UPDATE_END); vos_space_unhold(vos_cont2pool(ioc->ic_cont), &ioc->ic_space_held[0]); diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 68f1d057422..937a84139bf 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -58,7 +58,6 @@ key_punch(struct vos_object *obj, daos_epoch_t epoch, uint32_t pm_ver, struct vos_ilog_info akey_info = {0}; daos_epoch_range_t epr = {0, epoch}; d_iov_t riov; - bool read_conflict = false; int rc; if (flags & VOS_OF_COND_PUNCH) { @@ -103,9 +102,6 @@ key_punch(struct vos_object *obj, daos_epoch_t epoch, uint32_t pm_ver, D_GOTO(out, rc); } - if (vos_ts_check_rl_conflict(ts_set, epoch)) - read_conflict = true; - rc = vos_ilog_punch(obj->obj_cont, &krec->kr_ilog, &epr, &obj_info, &dkey_info, ts_set, false, false); @@ -138,9 +134,6 @@ key_punch(struct vos_object *obj, daos_epoch_t epoch, uint32_t pm_ver, vos_ilog_fetch_finish(&akey_info); } - if (rc == 0 && read_conflict) - rc = -DER_TX_RESTART; - return rc; } @@ -169,57 +162,6 @@ obj_punch(daos_handle_t coh, struct vos_object *obj, daos_epoch_t epoch, return rc; } -static void -update_read_timestamps(struct vos_ts_set *ts_set, daos_epoch_t epoch, - int akey_nr, int err) -{ - struct vos_ts_entry *entry; - int akey_idx; - - if (ts_set == NULL) - return; - - /** No conditional flags, so no timestamp updates */ - if ((ts_set->ts_flags & VOS_OF_COND_PUNCH) == 0) - return; - - /** Aborted for another reason, no timestamp updates */ - if (err != 0 && err != -DER_NONEXIST) - return; - - if (err == 0) { - /** the update succeeded so any negative entries used for - * checks should be changed to positive entries - */ - vos_ts_set_upgrade(ts_set); - } - - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_CONT, 0); - vos_ts_rh_update(entry, epoch, ts_set->ts_tx_id); - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_OBJ, 0); - vos_ts_rh_update(entry, epoch, ts_set->ts_tx_id); - - if (ts_set->ts_init_count == 2) { - vos_ts_rl_update(entry, epoch, ts_set->ts_tx_id); - return; - } - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_DKEY, 0); - vos_ts_rh_update(entry, epoch, ts_set->ts_tx_id); - if (ts_set->ts_init_count == 3) { - vos_ts_rl_update(entry, epoch, ts_set->ts_tx_id); - return; - } - for (akey_idx = 0; akey_idx < akey_nr; akey_idx++) { - entry = vos_ts_set_get_entry_type(ts_set, VOS_TS_TYPE_AKEY, - akey_idx); - if (entry == NULL) - return; - vos_ts_rl_update(entry, epoch, ts_set->ts_tx_id); - vos_ts_rh_update(entry, epoch, ts_set->ts_tx_id); - } -} - - /** * Punch an object, or punch a dkey, or punch an array of akeys. */ @@ -229,13 +171,12 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, unsigned int akey_nr, daos_key_t *akeys, struct dtx_handle *dth) { struct vos_dtx_act_ent **daes = NULL; - struct vos_ts_entry *entry; struct vos_ts_set *ts_set; struct vos_container *cont; struct vos_object *obj = NULL; daos_epoch_range_t epr = { 0 }; - bool read_conflict = false; int rc = 0; + uint64_t cflags = 0; if (dtx_is_valid_handle(dth)) epr.epr_hi = dth->dth_epoch; @@ -248,19 +189,32 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, vos_dth_set(dth); cont = vos_hdl2cont(coh); - rc = vos_ts_set_allocate(&ts_set, flags, akey_nr, + if (dtx_is_valid_handle(dth)) { + if (akey_nr) { + cflags = VOS_TS_WRITE_AKEY; + if (flags & VOS_OF_COND_PUNCH) + cflags |= VOS_TS_READ_AKEY; + } else if (dkey != NULL) { + cflags = VOS_TS_WRITE_DKEY; + if (flags & VOS_OF_COND_PUNCH) + cflags |= VOS_TS_READ_DKEY; + } else { + cflags = VOS_TS_WRITE_OBJ; + if (flags & VOS_OF_COND_PUNCH) + cflags |= VOS_TS_READ_OBJ; + } + + } + + rc = vos_ts_set_allocate(&ts_set, flags, cflags, akey_nr, dtx_is_valid_handle(dth) ? - &dth->dth_xid.dti_uuid : NULL); + &dth->dth_xid : NULL); if (rc != 0) goto reset; - if (!vos_ts_lookup(ts_set, cont->vc_ts_idx, false, &entry)) { - /** Re-cache the container timestamps */ - entry = vos_ts_alloc(ts_set, cont->vc_ts_idx, 0); - } - - if (vos_ts_check_rl_conflict(ts_set, epr.epr_hi)) - read_conflict = true; + rc = vos_ts_set_add(ts_set, cont->vc_ts_idx, NULL, 0); + if (rc != 0) + goto reset; rc = vos_tx_begin(dth, vos_cont2umm(cont)); if (rc != 0) @@ -284,9 +238,6 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, false, DAOS_INTENT_PUNCH, true, &obj, ts_set); if (rc == 0) { if (dkey) { /* key punch */ - if (vos_ts_check_rl_conflict(ts_set, epr.epr_hi)) - read_conflict = true; - rc = key_punch(obj, epr.epr_hi, pm_ver, dkey, akey_nr, akeys, flags, ts_set); } else { /* object punch */ @@ -294,11 +245,8 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } } - if (rc == 0 && read_conflict) - rc = -DER_TX_RESTART; - abort: - rc = vos_tx_end(dth, vos_cont2umm(cont), rc); + rc = vos_tx_end(cont, dth, NULL, NULL, true, rc); if (obj != NULL) vos_obj_release(vos_obj_cache_current(), obj, rc != 0); @@ -314,7 +262,12 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, vos_dth_set(NULL); - update_read_timestamps(ts_set, epr.epr_hi, akey_nr, rc); + if (rc == 0) + vos_ts_set_upgrade(ts_set); + + if (rc == -DER_NONEXIST || rc == 0) + vos_ts_set_update(ts_set, epr.epr_hi); + vos_ts_set_free(ts_set); return rc; diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index be7633ca2eb..a889cbf118b 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -224,8 +224,8 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, struct daos_llink *lret; struct obj_lru_key lkey; int rc = 0; + int tmprc; uint32_t cond_mask = 0; - bool found; D_ASSERT(cont != NULL); D_ASSERT(cont->vc_pool); @@ -266,14 +266,9 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, if (obj->obj_df) { D_DEBUG(DB_TRACE, "looking up object ilog"); - found = vos_ilog_ts_lookup(ts_set, &obj->obj_df->vo_ilog); - if (!found) { - int tmprc; - - tmprc = vos_ilog_ts_cache(ts_set, &obj->obj_df->vo_ilog, - &oid, sizeof(oid)); - D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ - } + tmprc = vos_ilog_ts_add(ts_set, &obj->obj_df->vo_ilog, &oid, + sizeof(oid)); + D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ goto check_object; } diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index bb29ccff658..b367f86da29 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -206,7 +206,6 @@ vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, d_iov_t val_iov; int rc; int tmprc; - bool found = false; *obj_p = NULL; d_iov_set(&key_iov, &oid, sizeof(oid)); @@ -220,16 +219,12 @@ vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, D_ASSERT(daos_unit_obj_id_equal(obj->vo_id, oid)); *obj_p = obj; ilog = &obj->vo_ilog; - - found = vos_ilog_ts_lookup(ts_set, ilog); - if (found) - goto out; } - tmprc = vos_ilog_ts_cache(ts_set, ilog, &oid, sizeof(oid)); + tmprc = vos_ilog_ts_add(ts_set, ilog, &oid, sizeof(oid)); D_ASSERT(tmprc == 0); /* Non-zero return for akey only */ -out: + return rc; } @@ -312,7 +307,7 @@ vos_oi_punch(struct vos_container *cont, daos_unit_oid_t oid, info, ts_set, true, (flags & VOS_OF_REPLAY_PC) != 0); - if (rc == 0 && vos_ts_check_rh_conflict(ts_set, epoch)) + if (rc == 0 && vos_ts_set_check_conflict(ts_set, epoch)) rc = -DER_TX_RESTART; VOS_TX_LOG_FAIL(rc, "Failed to update incarnation log entry: "DF_RC"\n", diff --git a/src/vos/vos_tls.h b/src/vos/vos_tls.h index b350e387d30..225a57b9f65 100644 --- a/src/vos/vos_tls.h +++ b/src/vos/vos_tls.h @@ -143,4 +143,16 @@ vos_kh_get(void) return vos_tls_get()->vtl_kh; } +/** hash seed for murmur hash */ +#define VOS_BTR_MUR_SEED 0xC0FFEE + +static inline uint64_t +vos_hash_get(const void *buf, uint64_t len) +{ + if (buf == NULL) + return vos_kh_get(); + + return d_hash_murmur64(buf, len, VOS_BTR_MUR_SEED); +} + #endif /* __VOS_TLS_H__ */ diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index 77bf5c4928e..2611487bfac 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -592,18 +592,30 @@ svt_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, } static int -svt_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) +svt_rec_free_internal(struct btr_instance *tins, struct btr_record *rec, + bool overwrite) { daos_epoch_t *epc = (daos_epoch_t *)&rec->rec_hkey[0]; struct vos_irec_df *irec = vos_rec2irec(tins, rec); bio_addr_t *addr = &irec->ir_ex_addr; + struct dtx_handle *dth = NULL; + struct vos_rsrvd_scm *rsrvd_scm; + struct pobj_action *act; + int i; if (UMOFF_IS_NULL(rec->rec_off)) return 0; + if (overwrite) { + dth = vos_dth_get(); + if (dth == NULL) + return -DER_NO_PERM; /* Not allowed */ + } + vos_dtx_deregister_record(&tins->ti_umm, tins->ti_coh, irec->ir_dtx, *epc, rec->rec_off); + /** TODO: handle NVME */ /* SCM value is stored together with vos_irec_df */ if (addr->ba_type == DAOS_MEDIA_NVME) { struct vos_pool *pool = tins->ti_priv; @@ -612,7 +624,34 @@ svt_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) vos_bio_addr_free(pool, addr, irec->ir_size); } - return umem_free(&tins->ti_umm, rec->rec_off); + if (!overwrite) + return umem_free(&tins->ti_umm, rec->rec_off); + + /** Find an empty slot for the deferred free */ + for (i = 0; i < dth->dth_deferred_cnt; i++) { + rsrvd_scm = dth->dth_deferred[i]; + D_ASSERT(rsrvd_scm != NULL); + + if (rsrvd_scm->rs_actv_at >= rsrvd_scm->rs_actv_cnt) + continue; /* Can't really be > but keep it simple */ + + act = &rsrvd_scm->rs_actv[rsrvd_scm->rs_actv_at]; + umem_defer_free(&tins->ti_umm, rec->rec_off, act); + rsrvd_scm->rs_actv_at++; + + return 0; + } + + /** We didn't reserve enough space */ + D_ASSERT(0); + + return -DER_MISC; +} + +static int +svt_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) +{ + return svt_rec_free_internal(tins, rec, false); } static int @@ -650,7 +689,7 @@ svt_rec_update(struct btr_instance *tins, struct btr_record *rec, D_DEBUG(DB_IO, "Overwrite epoch "DF_X64".%d\n", skey->sk_epoch, skey->sk_minor_epc); - rc = svt_rec_free(tins, rec, NULL); + rc = svt_rec_free_internal(tins, rec, true); if (rc != 0) return rc; @@ -898,7 +937,6 @@ key_tree_prepare(struct vos_object *obj, daos_handle_t toh, struct dcs_csum_info csum; struct vos_rec_bundle rbund; d_iov_t riov; - bool found; int rc; int tmprc; @@ -936,15 +974,12 @@ key_tree_prepare(struct vos_object *obj, daos_handle_t toh, case 0: krec = rbund.rb_krec; ilog = &krec->kr_ilog; - found = vos_ilog_ts_lookup(ts_set, ilog); - if (found) - break; /** fall through to cache re-cache entry */ case -DER_NONEXIST: /** Key hash already be calculated by dbtree_fetch so no need * to pass in the key here. */ - tmprc = vos_ilog_ts_cache(ts_set, ilog, NULL, 0); + tmprc = vos_ilog_ts_add(ts_set, ilog, NULL, 0); if (tmprc != 0) { rc = tmprc; D_ASSERT(tmprc == -DER_NO_PERM); @@ -1008,13 +1043,13 @@ key_tree_punch(struct vos_object *obj, daos_handle_t toh, daos_epoch_t epoch, struct vos_ts_set *ts_set, struct vos_ilog_info *parent, struct vos_ilog_info *info) { - struct vos_rec_bundle *rbund; + struct vos_rec_bundle *rbund = iov2rec_bundle(val_iov); struct vos_krec_df *krec; struct ilog_df *ilog = NULL; daos_epoch_range_t epr = {0, epoch}; - bool found = false; bool mark = false; int rc; + int lrc; rc = dbtree_fetch(toh, BTR_PROBE_EQ, DAOS_INTENT_UPDATE, key_iov, NULL, val_iov); @@ -1024,11 +1059,13 @@ key_tree_punch(struct vos_object *obj, daos_handle_t toh, daos_epoch_t epoch, rbund = iov2rec_bundle(val_iov); krec = rbund->rb_krec; ilog = &krec->kr_ilog; - found = vos_ilog_ts_lookup(ts_set, ilog); } - if (!found) - vos_ilog_ts_cache(ts_set, ilog, NULL, 0); + lrc = vos_ilog_ts_add(ts_set, ilog, NULL, 0); + if (lrc != 0) { + rc = lrc; + goto done; + } if (rc == -DER_NONEXIST && (flags & VOS_OF_COND_PUNCH)) { rc = -DER_NONEXIST; @@ -1050,15 +1087,16 @@ key_tree_punch(struct vos_object *obj, daos_handle_t toh, daos_epoch_t epoch, /** Punch always adds a log entry */ rbund = iov2rec_bundle(val_iov); krec = rbund->rb_krec; + ilog = &krec->kr_ilog; if (mark) vos_ilog_ts_mark(ts_set, ilog); - rc = vos_ilog_punch(obj->obj_cont, &krec->kr_ilog, &epr, parent, + rc = vos_ilog_punch(obj->obj_cont, ilog, &epr, parent, info, ts_set, true, (flags & VOS_OF_REPLAY_PC) != 0); - if (rc == 0 && vos_ts_check_rh_conflict(ts_set, epoch)) + if (rc == 0 && vos_ts_set_check_conflict(ts_set, epoch)) rc = -DER_TX_RESTART; done: VOS_TX_LOG_FAIL(rc, "Failed to punch key: "DF_RC"\n", DP_RC(rc)); diff --git a/src/vos/vos_ts.c b/src/vos/vos_ts.c index 4d6e224bac7..fdd152724a9 100644 --- a/src/vos/vos_ts.c +++ b/src/vos/vos_ts.c @@ -49,8 +49,8 @@ static const uint32_t type_counts[] = { #define TS_TRACE(action, entry, idx, type) \ D_DEBUG(DB_TRACE, "%s %s at idx %d(%p), read.hi="DF_U64 \ " read.lo="DF_U64"\n", action, type_strs[type], idx, \ - (entry)->te_record_ptr, (entry)->te_ts_rh, \ - (entry)->te_ts_rl) + (entry)->te_record_ptr, (entry)->te_ts.tp_ts_rh, \ + (entry)->te_ts.tp_ts_rl) /** This probably needs more thought */ static bool @@ -84,19 +84,21 @@ ts_update_on_evict(struct vos_ts_table *ts_table, struct vos_ts_entry *entry) } if (other == NULL) { - if (entry->te_ts_rl > ts_table->tt_ts_rl) { - ts_table->tt_ts_rl = entry->te_ts_rl; - uuid_copy(ts_table->tt_tx_rl, entry->te_tx_rl); + if (entry->te_ts.tp_ts_rl > ts_table->tt_ts_rl) { + vos_ts_copy(&ts_table->tt_ts_rl, &ts_table->tt_tx_rl, + entry->te_ts.tp_ts_rl, + &entry->te_ts.tp_tx_rl); } - if (entry->te_ts_rh > ts_table->tt_ts_rh) { - ts_table->tt_ts_rh = entry->te_ts_rh; - uuid_copy(ts_table->tt_tx_rh, entry->te_tx_rh); + if (entry->te_ts.tp_ts_rh > ts_table->tt_ts_rh) { + vos_ts_copy(&ts_table->tt_ts_rh, &ts_table->tt_tx_rh, + entry->te_ts.tp_ts_rh, + &entry->te_ts.tp_tx_rh); } return true; } - vos_ts_rl_update(other, entry->te_ts_rl, entry->te_tx_rl); - vos_ts_rh_update(other, entry->te_ts_rh, entry->te_tx_rh); + vos_ts_rl_update(other, entry->te_ts.tp_ts_rl, &entry->te_ts.tp_tx_rl); + vos_ts_rh_update(other, entry->te_ts.tp_ts_rh, &entry->te_ts.tp_tx_rh); return true; } @@ -184,8 +186,8 @@ vos_ts_table_alloc(struct vos_ts_table **ts_tablep) ts_table->tt_ts_rl = vos_start_epoch; ts_table->tt_ts_rh = vos_start_epoch; - uuid_clear(ts_table->tt_tx_rl); - uuid_clear(ts_table->tt_tx_rh); + uuid_clear(ts_table->tt_tx_rl.dti_uuid); + uuid_clear(ts_table->tt_tx_rh.dti_uuid); miss_cursor = ts_table->tt_misses; for (i = 0; i < VOS_TS_TYPE_COUNT; i++) { info = &ts_table->tt_type_info[i]; @@ -266,10 +268,10 @@ vos_ts_evict_lru(struct vos_ts_table *ts_table, struct vos_ts_entry *parent, if (parent == NULL) { /** Use global timestamps for the type to initialize it */ - entry->te_ts_rl = ts_table->tt_ts_rl; - entry->te_ts_rh = ts_table->tt_ts_rh; - uuid_copy(entry->te_tx_rl, ts_table->tt_tx_rl); - uuid_copy(entry->te_tx_rh, ts_table->tt_tx_rh); + vos_ts_copy(&entry->te_ts.tp_ts_rl, &entry->te_ts.tp_tx_rl, + ts_table->tt_ts_rl, &ts_table->tt_tx_rl); + vos_ts_copy(&entry->te_ts.tp_ts_rh, &entry->te_ts.tp_tx_rh, + ts_table->tt_ts_rh, &ts_table->tt_tx_rh); entry->te_parent_ptr = NULL; } else { entry->te_parent_ptr = parent->te_record_ptr; @@ -281,17 +283,17 @@ vos_ts_evict_lru(struct vos_ts_table *ts_table, struct vos_ts_entry *parent, if (ts_source == NULL) /* for negative and uncached entries */ ts_source = parent; - entry->te_ts_rl = ts_source->te_ts_rl; - entry->te_ts_rh = ts_source->te_ts_rh; - uuid_copy(entry->te_tx_rl, ts_source->te_tx_rl); - uuid_copy(entry->te_tx_rh, ts_source->te_tx_rh); + vos_ts_copy(&entry->te_ts.tp_ts_rl, &entry->te_ts.tp_tx_rl, + ts_source->te_ts.tp_ts_rl, + &ts_source->te_ts.tp_tx_rl); + vos_ts_copy(&entry->te_ts.tp_ts_rh, &entry->te_ts.tp_tx_rh, + ts_source->te_ts.tp_ts_rh, + &ts_source->te_ts.tp_tx_rh); } /** Set the lower bounds for the entry */ entry->te_hash_idx = hash_idx; entry->te_record_ptr = idx; - uuid_clear(entry->te_tx_rl); - uuid_clear(entry->te_tx_rh); TS_TRACE("Allocated", entry, *idx, type); D_ASSERT(type == info->ti_type); @@ -301,7 +303,8 @@ vos_ts_evict_lru(struct vos_ts_table *ts_table, struct vos_ts_entry *parent, int vos_ts_set_allocate(struct vos_ts_set **ts_set, uint64_t flags, - uint32_t akey_nr, uuid_t *tx_id) + uint32_t cflags, uint32_t akey_nr, + const struct dtx_id *tx_id) { uint32_t size; uint64_t array_size; @@ -318,8 +321,24 @@ vos_ts_set_allocate(struct vos_ts_set **ts_set, uint64_t flags, return -DER_NOMEM; (*ts_set)->ts_flags = flags; + (*ts_set)->ts_cflags = cflags; + switch (cflags & VOS_TS_WRITE_MASK) { + case VOS_TS_WRITE_OBJ: + (*ts_set)->ts_wr_level = VOS_TS_TYPE_OBJ; + break; + case VOS_TS_WRITE_DKEY: + (*ts_set)->ts_wr_level = VOS_TS_TYPE_DKEY; + break; + case VOS_TS_WRITE_AKEY: + (*ts_set)->ts_wr_level = VOS_TS_TYPE_AKEY; + break; + default: + /** Already zero */ + break; + } (*ts_set)->ts_set_size = size; - uuid_copy((*ts_set)->ts_tx_id, *tx_id); + uuid_copy((*ts_set)->ts_tx_id.dti_uuid, tx_id->dti_uuid); + (*ts_set)->ts_tx_id.dti_hlc = tx_id->dti_hlc; return 0; } @@ -361,3 +380,47 @@ vos_ts_set_upgrade(struct vos_ts_set *ts_set) set_entry->se_entry = entry; } } + +static inline bool +vos_ts_check_conflict(daos_epoch_t read_time, const struct dtx_id *read_id, + daos_epoch_t write_time, const struct dtx_id *write_id) + +{ + if (write_time > read_time) + return false; + + if (write_time != read_time) + return true; + + if (read_id->dti_hlc != write_id->dti_hlc) + return true; + + return uuid_compare(read_id->dti_uuid, write_id->dti_uuid) != 0; +} + +bool +vos_ts_check_read_conflict(struct vos_ts_set *ts_set, int idx, + daos_epoch_t write_time) +{ + struct vos_ts_set_entry *se; + struct vos_ts_entry *entry; + + D_ASSERT(ts_set != NULL); + + se = &ts_set->ts_entries[idx]; + entry = se->se_entry; + + if (se->se_etype < ts_set->ts_wr_level) { + /* check the low time */ + return vos_ts_check_conflict(entry->te_ts.tp_ts_rl, + &entry->te_ts.tp_tx_rl, + write_time, &ts_set->ts_tx_id); + } + + D_ASSERT(se->se_etype == ts_set->ts_wr_level); + + /* check the high time */ + return vos_ts_check_conflict(entry->te_ts.tp_ts_rh, + &entry->te_ts.tp_tx_rh, + write_time, &ts_set->ts_tx_id); +} diff --git a/src/vos/vos_ts.h b/src/vos/vos_ts.h index cbae7cb245a..c965ce93681 100644 --- a/src/vos/vos_ts.h +++ b/src/vos/vos_ts.h @@ -30,6 +30,7 @@ #ifndef __VOS_TS__ #define __VOS_TS__ +#include #include #include @@ -50,6 +51,17 @@ struct vos_ts_info { uint32_t ti_count; }; +struct vos_ts_pair { + /** Low read time or read time for the object/key */ + daos_epoch_t tp_ts_rl; + /** High read time or read time for the object/key */ + daos_epoch_t tp_ts_rh; + /** Low read tx */ + struct dtx_id tp_tx_rl; + /** High read tx */ + struct dtx_id tp_tx_rh; +}; + struct vos_ts_entry { struct vos_ts_info *te_info; /** Key for current occupant */ @@ -58,22 +70,41 @@ struct vos_ts_entry { uint32_t *te_parent_ptr; /** negative entry cache */ uint32_t *te_miss_idx; - /** Low read time or read time for the object/key */ - daos_epoch_t te_ts_rl; - /** Max read time for subtrees */ - daos_epoch_t te_ts_rh; - /** uuid's of transactions. These can potentially be changed - * to 16 bits and save some space here. But for now, stick - * with the full id. - */ - /** Low read tx */ - uuid_t te_tx_rl; - /** high read tx */ - uuid_t te_tx_rh; + /** The timestamps for the entry */ + struct vos_ts_pair te_ts; /** Hash index in parent */ uint32_t te_hash_idx; }; +/** Check/update flags for a ts set entry */ +enum { + /** Mark operation as CONT read */ + VOS_TS_READ_CONT = (1 << 0), + /** Mark operation as OBJ read */ + VOS_TS_READ_OBJ = (1 << 1), + /** Mark operation as DKEY read */ + VOS_TS_READ_DKEY = (1 << 2), + /** Mark operation as AKEY read */ + VOS_TS_READ_AKEY = (1 << 3), + /** Read mask */ + VOS_TS_READ_MASK = (VOS_TS_READ_CONT | VOS_TS_READ_OBJ | + VOS_TS_READ_DKEY | VOS_TS_READ_AKEY), + /** Child of object mask */ + VOS_TS_READ_OBJ_CHILD = (VOS_TS_READ_OBJ | VOS_TS_READ_DKEY | + VOS_TS_READ_AKEY), + /** Child of dkey mask */ + VOS_TS_READ_DKEY_CHILD = (VOS_TS_READ_DKEY | VOS_TS_READ_AKEY), + /** Mark operation as OBJ write */ + VOS_TS_WRITE_OBJ = (1 << 4), + /** Mark operation as DKEY write */ + VOS_TS_WRITE_DKEY = (1 << 5), + /** Mark operation as AKEY write */ + VOS_TS_WRITE_AKEY = (1 << 6), + /** Write mask */ + VOS_TS_WRITE_MASK = (VOS_TS_WRITE_DKEY | VOS_TS_WRITE_AKEY | + VOS_TS_WRITE_OBJ), +}; + struct vos_ts_set_entry { /** Pointer to the entry at this level */ struct vos_ts_entry *se_entry; @@ -81,14 +112,24 @@ struct vos_ts_set_entry { uint32_t *se_create_idx; /** Cache of calculated hash for obj/key */ uint64_t se_hash; + /** The expected type of this entry. */ + uint32_t se_etype; }; /** Structure looking up and caching operation flags */ struct vos_ts_set { /** Operation flags */ uint64_t ts_flags; + /** type of next entry */ + uint64_t ts_etype; + /** The Check/update flags for the set */ + uint32_t ts_cflags; + /** Write level for the set */ + uint16_t ts_wr_level; + /** Max type */ + uint16_t ts_max_type; /** Transaction that owns the set */ - uuid_t ts_tx_id; + struct dtx_id ts_tx_id; /** size of the set */ uint32_t ts_set_size; /** Number of initialized entries */ @@ -121,9 +162,9 @@ struct vos_ts_table { /** Global read high timestamp for type */ daos_epoch_t tt_ts_rh; /** Transaciton id associated with global read low timestamp */ - uuid_t tt_tx_rl; + struct dtx_id tt_tx_rl; /** Transaciton id associated with global read high timestamp */ - uuid_t tt_tx_rh; + struct dtx_id tt_tx_rh; /** Miss index table */ uint32_t *tt_misses; /** Timestamp table pointers for a type */ @@ -252,7 +293,6 @@ vos_ts_alloc(struct vos_ts_set *ts_set, uint32_t *idx, uint64_t hash) if (ts_set == NULL) return NULL; - ts_table = vos_ts_table_get(); parent = ts_set_get_parent(ts_set); @@ -302,52 +342,6 @@ vos_ts_set_get_entry(struct vos_ts_set *ts_set) return entry->se_entry; } -/** Get the specified entry in the set - * - * \param[in] ts_set The timestamp set - * \param[in] type The type of entry - * \param[in] akey_idx 0 or index of the akey - * - * \return Returns the last entry added to the set or NULL - */ -static inline struct vos_ts_entry * -vos_ts_set_get_entry_type(struct vos_ts_set *ts_set, uint32_t type, - int akey_idx) -{ - struct vos_ts_set_entry *entry; - uint32_t idx = (type / 2) + akey_idx; - - D_ASSERT(akey_idx == 0 || type == VOS_TS_TYPE_AKEY); - - if (ts_set == NULL || idx >= ts_set->ts_init_count) - return NULL; - - entry = &ts_set->ts_entries[idx]; - return entry->se_entry; -} - -/** Set the index of the associated positive entry in the last entry - * in the set. - * - * \param[in] ts_set The timestamp set - * \param[in] idx Pointer to the index that will be used - * when allocating the positive entry - */ -static inline void -vos_ts_set_mark_entry(struct vos_ts_set *ts_set, uint32_t *idx) -{ - struct vos_ts_set_entry *entry; - - if (ts_set == NULL || ts_set->ts_init_count == 0) - return; - - entry = &ts_set->ts_entries[ts_set->ts_init_count - 1]; - - /** Should be a negative entry */ - D_ASSERT(entry->se_entry->te_info->ti_type & 1); - entry->se_create_idx = idx; -} - /** When a subtree doesn't exist, we need a negative entry. The entry in this * case is identified by a hash. This looks up the negative entry and * allocates it if necessary. Resets te_create_idx to NULL. @@ -408,6 +402,121 @@ vos_ts_get_negative(struct vos_ts_set *ts_set, uint64_t hash, bool reset) return neg_entry; } + +/** Set the type of the next entry. This gets set automatically + * by default in vos_ts_set_add to child type of entry being + * inserted so only required when this isn't suitable + * + * \param[in] ts_set The timestamp set + * \param[in] type The type of the next insertion + */ +static inline void +vos_ts_set_type(struct vos_ts_set *ts_set, uint32_t type) +{ + if (ts_set == NULL) + return; + + ts_set->ts_etype = type; +} + +static inline int +vos_ts_set_add(struct vos_ts_set *ts_set, uint32_t *idx, const void *rec, + size_t rec_size) +{ + struct vos_ts_set_entry *se; + struct vos_ts_entry *entry; + uint64_t hash = 0; + uint32_t expected_type; + + if (ts_set == NULL) + return 0; + + if (idx == NULL) + goto calc_hash; + + if (vos_ts_lookup(ts_set, idx, false, &entry)) { + expected_type = entry->te_info->ti_type; + D_ASSERT(expected_type == ts_set->ts_etype); + goto set_params; + } + +calc_hash: + if (ts_set->ts_etype > VOS_TS_TYPE_CONT) + hash = vos_hash_get(rec, rec_size); + + if (idx != NULL) { + entry = vos_ts_alloc(ts_set, idx, hash); + if (entry == NULL) + return -DER_NO_PERM; + expected_type = entry->te_info->ti_type; + D_ASSERT(expected_type == ts_set->ts_etype); + } else { + entry = vos_ts_get_negative(ts_set, hash, false); + D_ASSERT(entry != NULL); + expected_type = entry->te_info->ti_type + 1; + } + +set_params: + D_ASSERT(ts_set->ts_init_count >= 1); + se = &ts_set->ts_entries[ts_set->ts_init_count - 1]; + se->se_etype = ts_set->ts_etype; + if (se->se_etype > ts_set->ts_max_type) + ts_set->ts_max_type = se->se_etype; + if (expected_type != VOS_TS_TYPE_AKEY) + ts_set->ts_etype = expected_type + 2; + se->se_entry = entry; + se->se_hash = hash; + se->se_create_idx = NULL; + + return 0; +} + +/** Get the specified entry in the set + * + * \param[in] ts_set The timestamp set + * \param[in] type The type of entry + * \param[in] akey_idx 0 or index of the akey + * + * \return Returns the last entry added to the set or NULL + */ +static inline struct vos_ts_entry * +vos_ts_set_get_entry_type(struct vos_ts_set *ts_set, uint32_t type, + int akey_idx) +{ + struct vos_ts_set_entry *entry; + uint32_t idx = (type / 2) + akey_idx; + + D_ASSERT(akey_idx == 0 || type == VOS_TS_TYPE_AKEY); + + if (ts_set == NULL || idx >= ts_set->ts_init_count) + return NULL; + + entry = &ts_set->ts_entries[idx]; + return entry->se_entry; +} + +/** Set the index of the associated positive entry in the last entry + * in the set. + * + * \param[in] ts_set The timestamp set + * \param[in] idx Pointer to the index that will be used + * when allocating the positive entry + */ +static inline void +vos_ts_set_mark_entry(struct vos_ts_set *ts_set, uint32_t *idx) +{ + struct vos_ts_set_entry *entry; + + if (ts_set == NULL || ts_set->ts_init_count == 0) + return; + + entry = &ts_set->ts_entries[ts_set->ts_init_count - 1]; + + /** Should be a negative entry */ + D_ASSERT(entry->se_entry->te_info->ti_type & 1); + entry->se_create_idx = idx; +} + /** If an entry is still in the thread local timestamp cache, evict it and * update global timestamps for the type. Move the evicted entry to the LRU * and mark it as already evicted. @@ -445,6 +554,7 @@ vos_ts_table_free(struct vos_ts_table **ts_table); * * \param[in,out] ts_set Pointer to set * \param[in] flags Operations flags + * \param[in] cflags Check/update flags * \param[in] akey_nr Number of akeys in operation * \param[in] tx_id Optional transaction id * @@ -452,7 +562,8 @@ vos_ts_table_free(struct vos_ts_table **ts_table); */ int vos_ts_set_allocate(struct vos_ts_set **ts_set, uint64_t flags, - uint32_t akey_nr, uuid_t *tx_id); + uint32_t cflags, uint32_t akey_nr, + const struct dtx_id *tx_id); /** Upgrade any negative entries in the set now that the associated * update/punch has committed @@ -472,84 +583,127 @@ vos_ts_set_free(struct vos_ts_set *ts_set) D_FREE(ts_set); } -/** Update the low timestamp if the new read is newer - * - * \param[in] entry The timestamp entry - * \param[in] read_time The new read timestamp - * \param[in] tx_id The uuid of the new read - */ +/** Internal API to copy timestamp */ +static inline void +vos_ts_copy(daos_epoch_t *dest_epc, struct dtx_id *dest_id, + daos_epoch_t src_epc, const struct dtx_id *src_id) +{ + *dest_epc = src_epc; + uuid_copy(dest_id->dti_uuid, src_id->dti_uuid); + dest_id->dti_hlc = src_id->dti_hlc; +} + +/** Internal API to update low read timestamp and tx id */ static inline void vos_ts_rl_update(struct vos_ts_entry *entry, daos_epoch_t read_time, - const uuid_t tx_id) + const struct dtx_id *tx_id) { - if (entry == NULL || read_time < entry->te_ts_rl) + if (entry == NULL || read_time < entry->te_ts.tp_ts_rl) return; - entry->te_ts_rl = read_time; - uuid_copy(entry->te_tx_rl, tx_id); + vos_ts_copy(&entry->te_ts.tp_ts_rl, &entry->te_ts.tp_tx_rl, + read_time, tx_id); } -/** Update the low timestamp if the new read is newer - * - * \param[in] entry The timestamp entry - * \param[in] read_time The new read timestamp - * \param[in] tx_id The uuid of the new read - */ +/** Internal API to update high read timestamp and tx id */ static inline void vos_ts_rh_update(struct vos_ts_entry *entry, daos_epoch_t read_time, - const uuid_t tx_id) + const struct dtx_id *tx_id) { - if (entry == NULL || read_time < entry->te_ts_rh) + if (entry == NULL || read_time < entry->te_ts.tp_ts_rh) return; - entry->te_ts_rh = read_time; - uuid_copy(entry->te_tx_rh, tx_id); + vos_ts_copy(&entry->te_ts.tp_ts_rh, &entry->te_ts.tp_tx_rh, + read_time, tx_id); } -/** Check the read low timestamp at current entry. +/** Internal API to check read conflict of a given entry */ +bool +vos_ts_check_read_conflict(struct vos_ts_set *ts_set, int idx, + daos_epoch_t write_time); + +/** Checks the set for read/write conflicts * - * \param[in] ts_set The timestamp set - * \param[in] write_time The write time + * \param[in] ts_set The timestamp read set + * \param[in] write_time The time of the update * - * \return true Conflict - * false No conflict (or no timestamp set) + * \return true Conflict + * false No conflict (or no timestamp set) */ -static inline bool -vos_ts_check_rl_conflict(struct vos_ts_set *ts_set, daos_epoch_t write_time) +static inline int +vos_ts_set_check_conflict(struct vos_ts_set *ts_set, daos_epoch_t write_time) { - struct vos_ts_entry *entry; + int i; - entry = vos_ts_set_get_entry(ts_set); - if (entry == NULL || write_time > entry->te_ts_rl) + if (ts_set == NULL) return false; - if (write_time != entry->te_ts_rl) - return true; + if ((ts_set->ts_cflags & VOS_TS_WRITE_MASK) == 0) + return false; + + for (i = 0; i < ts_set->ts_init_count; i++) { + /** Will check the appropriate read timestamp based on the type + * of the entry at index i. + */ + if (vos_ts_check_read_conflict(ts_set, i, write_time)) + return true; + } - return uuid_compare(ts_set->ts_tx_id, entry->te_tx_rl) != 0; + return false; } -/** Check the read high timestamp at current entry. - * - * \param[in] ts_set The timestamp set - * \param[in] write_time The write time +/** Update the read timestamps for the set after a successful operation * - * \return true Conflict - * false No conflict (or no timestamp set) + * \param[in] ts_set The timestamp set + * \param[in] read_time The new read timestamp */ -static inline bool -vos_ts_check_rh_conflict(struct vos_ts_set *ts_set, daos_epoch_t write_time) +static inline void +vos_ts_set_update(struct vos_ts_set *ts_set, daos_epoch_t read_time) { - struct vos_ts_entry *entry; + struct vos_ts_set_entry *se; + uint32_t high_mask = 0; + uint32_t low_mask = 0; + int i; - entry = vos_ts_set_get_entry(ts_set); - if (entry == NULL || write_time > entry->te_ts_rh) - return false; + if (ts_set == NULL) + return; - if (write_time != entry->te_ts_rh) - return true; + if ((ts_set->ts_cflags & VOS_TS_READ_MASK) == 0) + return; + + for (i = 0; i < ts_set->ts_init_count; i++) { + se = &ts_set->ts_entries[i]; + + switch (se->se_etype) { + case VOS_TS_TYPE_CONT: + high_mask = VOS_TS_READ_MASK; + low_mask = VOS_TS_READ_CONT; + break; + case VOS_TS_TYPE_OBJ: + high_mask = VOS_TS_READ_OBJ_CHILD; + low_mask = VOS_TS_READ_OBJ; + if (ts_set->ts_max_type > VOS_TS_TYPE_OBJ) + break; + case VOS_TS_TYPE_DKEY: + high_mask |= VOS_TS_READ_DKEY_CHILD; + low_mask |= VOS_TS_READ_DKEY; + if (ts_set->ts_max_type > VOS_TS_TYPE_DKEY) + break; + case VOS_TS_TYPE_AKEY: + high_mask |= VOS_TS_READ_AKEY; + low_mask |= VOS_TS_READ_AKEY; + break; + default: + D_ASSERT(0); + } - return uuid_compare(ts_set->ts_tx_id, entry->te_tx_rh) != 0; + if (ts_set->ts_cflags & high_mask) + vos_ts_rh_update(se->se_entry, read_time, + &ts_set->ts_tx_id); + if (ts_set->ts_cflags & low_mask) + vos_ts_rl_update(se->se_entry, read_time, + &ts_set->ts_tx_id); + } } #endif /* __VOS_TS__ */