diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 89d43ede944..fae74fd3de4 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -14,11 +14,16 @@ jobs: uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} + - uses: actions/setup-python@v5 + with: + python-version: '3.11' - name: Install core python packages run: python3 -m pip install --requirement requirements.txt - name: Install extra python packages run: python3 -m pip install --requirement utils/cq/requirements.txt - name: Install enchant run: sudo apt-get update && sudo apt-get -y install python3-enchant + - name: Show versions + run: ./utils/cq/daos_pylint.py --version - name: Run pylint check. run: ./utils/cq/daos_pylint.py --git --output-format github diff --git a/Jenkinsfile b/Jenkinsfile index 2f8f9f774d2..c3b4b58d54c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1040,78 +1040,78 @@ pipeline { } } } // stage('Fault injection testing on EL 8.8') - // stage('Test RPMs on EL 8.6') { - // when { - // beforeAgent true - // expression { ! skipStage() } - // } - // agent { - // label params.CI_UNIT_VM1_LABEL - // } - // steps { - // job_step_update( - // testRpm(inst_repos: daosRepos(), - // daos_pkg_version: daosPackagesVersion(next_version)) - // ) - // } - // post { - // always { - // rpm_test_post(env.STAGE_NAME, env.NODELIST) - // } - // } - // } // stage('Test CentOS 7 RPMs') - // stage('Test RPMs on Leap 15.4') { - // when { - // beforeAgent true - // expression { ! skipStage() } - // } - // agent { - // label params.CI_UNIT_VM1_LABEL - // } - // steps { - // /* neither of these work as FTest strips the first node - // out of the pool requiring 2 node clusters at minimum - // * additionally for this use-case, can't override - // ftest_arg with this :-( - // script { - // 'Test RPMs on Leap 15.4': getFunctionalTestStage( - // name: 'Test RPMs on Leap 15.4', - // pragma_suffix: '', - // label: params.CI_UNIT_VM1_LABEL, - // next_version: next_version, - // stage_tags: '', - // default_tags: 'test_daos_management', - // nvme: 'auto', - // run_if_pr: true, - // run_if_landing: true, - // job_status: job_status_internal - // ) - // } - // job_step_update( - // functionalTest( - // test_tag: 'test_daos_management', - // ftest_arg: '--yaml_extension single_host', - // inst_repos: daosRepos(), - // inst_rpms: functionalPackages(1, next_version, 'tests-internal'), - // test_function: 'runTestFunctionalV2')) - // } - // post { - // always { - // functionalTestPostV2() - // job_status_update() - // } - // } */ - // job_step_update( - // testRpm(inst_repos: daosRepos(), - // daos_pkg_version: daosPackagesVersion(next_version)) - // ) - // } - // post { - // always { - // rpm_test_post(env.STAGE_NAME, env.NODELIST) - // } - // } - // } // stage('Test Leap 15 RPMs') + stage('Test RPMs on EL 8.6') { + when { + beforeAgent true + expression { ! skipStage() } + } + agent { + label params.CI_UNIT_VM1_LABEL + } + steps { + job_step_update( + testRpm(inst_repos: daosRepos(), + daos_pkg_version: daosPackagesVersion(next_version)) + ) + } + post { + always { + rpm_test_post(env.STAGE_NAME, env.NODELIST) + } + } + } // stage('Test CentOS 7 RPMs') + stage('Test RPMs on Leap 15.4') { + when { + beforeAgent true + expression { !
skipStage() } + } + agent { + label params.CI_UNIT_VM1_LABEL + } + steps { + /* neither of these work as FTest strips the first node + out of the pool requiring 2 node clusters at minimum + * additionally for this use-case, can't override + ftest_arg with this :-( + script { + 'Test RPMs on Leap 15.4': getFunctionalTestStage( + name: 'Test RPMs on Leap 15.4', + pragma_suffix: '', + label: params.CI_UNIT_VM1_LABEL, + next_version: next_version, + stage_tags: '', + default_tags: 'test_daos_management', + nvme: 'auto', + run_if_pr: true, + run_if_landing: true, + job_status: job_status_internal + ) + } + job_step_update( + functionalTest( + test_tag: 'test_daos_management', + ftest_arg: '--yaml_extension single_host', + inst_repos: daosRepos(), + inst_rpms: functionalPackages(1, next_version, 'tests-internal'), + test_function: 'runTestFunctionalV2')) + } + post { + always { + functionalTestPostV2() + job_status_update() + } + } */ + job_step_update( + testRpm(inst_repos: daosRepos(), + daos_pkg_version: daosPackagesVersion(next_version)) + ) + } + post { + always { + rpm_test_post(env.STAGE_NAME, env.NODELIST) + } + } + } // stage('Test Leap 15 RPMs') } // parallel } // stage('Test') stage('Test Storage Prep on EL 8.8') { @@ -1202,7 +1202,7 @@ pipeline { stage_tags: 'hw,medium,provider', default_tags: startedByTimer() ? 'pr daily_regression' : 'pr', default_nvme: 'auto', - provider: 'ucx+dc_x', + provider: cachedCommitPragma('Test-provider-ucx', 'ucx+ud_x'), run_if_pr: false, run_if_landing: false, job_status: job_status_internal diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh index b2dfc21b43d..d318b3601e3 100755 --- a/ci/functional/test_main.sh +++ b/ci/functional/test_main.sh @@ -16,14 +16,17 @@ test_tag="$TEST_TAG" tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT") first_node=${NODELIST%%,*} +hardware_ok=false + cluster_reboot () { # shellcheck disable=SC2029,SC2089 clush -B -S -o '-i ci_key' -l root -w "${tnodes}" reboot || true # shellcheck disable=SC2029,SC2089 poll_cmd=( clush -B -S -o "-i ci_key" -l root -w "${tnodes}" ) - poll_cmd+=( '"cat /etc/os-release"' ) - reboot_timeout=900 # 15 minutes + poll_cmd+=( cat /etc/os-release ) + # 20 minutes, HPE systems may take more than 15 minutes. + reboot_timeout=1200 retry_wait=10 # seconds timeout=$((SECONDS + reboot_timeout)) while [ "$SECONDS" -lt "$timeout" ]; do @@ -42,6 +45,8 @@ test_cluster() { FIRST_NODE=${first_node} \ TEST_RPMS=${TEST_RPMS} \ NODELIST=${tnodes} \ + BUILD_URL=\"$BUILD_URL\" \ + STAGE_NAME=\"$STAGE_NAME\" \ $(cat ci/functional/test_main_prep_node.sh)" } @@ -50,8 +55,13 @@ clush -B -S -o '-i ci_key' -l root -w "${first_node}" \ if ! test_cluster; then # Sometimes a cluster reboot will fix the issue so try it once. 
- cluster_reboot - test_cluster + if cluster_reboot; then + if test_cluster; then + hardware_ok=true + fi + fi +else + hardware_ok=true fi # collect the _results.xml files from test_main_prep_nodes before they @@ -79,17 +89,20 @@ export DAOS_TARGET_OVERSUBSCRIBE=1 rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results -if $TEST_RPMS; then - # shellcheck disable=SC2029 - ssh -i ci_key -l jenkins "${first_node}" \ - "TEST_TAG=\"$test_tag\" \ - TNODES=\"$tnodes\" \ - FTEST_ARG=\"${FTEST_ARG:-}\" \ - WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ - STAGE_NAME=\"$STAGE_NAME\" \ - $(cat ci/functional/test_main_node.sh)" -else - ./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG" + +if "$hardware_ok"; then + if $TEST_RPMS; then + # shellcheck disable=SC2029 + ssh -i ci_key -l jenkins "${first_node}" \ + "TEST_TAG=\"$test_tag\" \ + TNODES=\"$tnodes\" \ + FTEST_ARG=\"${FTEST_ARG:-}\" \ + WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ + STAGE_NAME=\"$STAGE_NAME\" \ + $(cat ci/functional/test_main_node.sh)" + else + ./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG" + fi fi # Now rename the previously collected hardware test data for Jenkins @@ -104,3 +117,4 @@ for node in ${tnodes//,/ }; do mv "$old_name" "$new_name" fi done +"$hardware_ok" diff --git a/ci/functional/test_main_prep_node.sh b/ci/functional/test_main_prep_node.sh index 85f3e51aa0f..32993d114e4 100755 --- a/ci/functional/test_main_prep_node.sh +++ b/ci/functional/test_main_prep_node.sh @@ -8,6 +8,8 @@ set -eux : "${FIRST_NODE:=}" : "${OPERATIONS_EMAIL:=}" +: "${STAGE_NAME:=Unknown}" +: "${BUILD_URL:=Unknown}" result=0 mail_message='' @@ -38,9 +40,10 @@ function do_mail { return fi # shellcheck disable=SC2059 + build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl" mail -s "Hardware check failed after reboot!" \ -r "$HOSTNAME"@intel.com "$OPERATIONS_EMAIL" \ - <<< "$mail_message" + <<< "$build_info$mail_message" set -x } @@ -242,7 +245,7 @@ if [ -e /sys/class/net/ib1 ]; then testcases+=" $nl" ((testruns++)) || true - testcases+=" ${nl}" + testcases+=" ${nl}" if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen." mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl" diff --git a/ci/provisioning/post_provision_config.sh b/ci/provisioning/post_provision_config.sh index 5da9c8628f4..11b308f513e 100755 --- a/ci/provisioning/post_provision_config.sh +++ b/ci/provisioning/post_provision_config.sh @@ -19,6 +19,9 @@ source ci/provisioning/post_provision_config_common_functions.sh # shellcheck disable=SC1091 source ci/junit.sh + +: "${MLNX_VER_NUM:=latest-5.8}" + : "${DISTRO:=EL_7}" DSL_REPO_var="DAOS_STACK_${DISTRO}_LOCAL_REPO" DSG_REPO_var="DAOS_STACK_${DISTRO}_GROUP_REPO" @@ -44,6 +47,7 @@ if ! 
retry_cmd 2400 clush -B -S -l root -w "$NODESTRING" \ DISTRO=\"$DISTRO\" DAOS_STACK_RETRY_DELAY_SECONDS=\"$DAOS_STACK_RETRY_DELAY_SECONDS\" DAOS_STACK_RETRY_COUNT=\"$DAOS_STACK_RETRY_COUNT\" + MLNX_VER_NUM=\"$MLNX_VER_NUM\" BUILD_URL=\"$BUILD_URL\" STAGE_NAME=\"$STAGE_NAME\" OPERATIONS_EMAIL=\"$OPERATIONS_EMAIL\" diff --git a/ci/provisioning/post_provision_config_common.sh b/ci/provisioning/post_provision_config_common.sh index 0770e24ffeb..06ad80b984a 100755 --- a/ci/provisioning/post_provision_config_common.sh +++ b/ci/provisioning/post_provision_config_common.sh @@ -53,6 +53,3 @@ case "$ID_LIKE" in EXCLUDE_UPGRADE+=,fuse,fuse-libs,fuse-devel ;; esac - -# shellcheck disable=SC2034 -MLNX_VER_NUM=5.8-3.0.7.0 \ No newline at end of file diff --git a/ci/provisioning/post_provision_config_nodes_EL_8.sh b/ci/provisioning/post_provision_config_nodes_EL_8.sh index 00c9ad72f9a..0c73e1d0c96 100644 --- a/ci/provisioning/post_provision_config_nodes_EL_8.sh +++ b/ci/provisioning/post_provision_config_nodes_EL_8.sh @@ -55,19 +55,21 @@ install_mofed() { stream=false gversion="$VERSION_ID" if [ "$gversion" == "8" ]; then - gversion="8.6" + # Mellanox does not have a release for 8.9 yet. + gversion="8.8" stream=true elif [[ $gversion = *.*.* ]]; then gversion="${gversion%.*}" fi # Add a repo to install MOFED RPMS - repo_url=https://artifactory.dc.hpdd.intel.com/artifactory/mlnx_ofed/"$MLNX_VER_NUM-rhel$gversion"-x86_64/ + artifactory_base_url="https://artifactory.dc.hpdd.intel.com/artifactory/" + mellanox_proxy="${artifactory_base_url}mellanox-proxy/mlnx_ofed/" + mellanox_key_url="${artifactory_base_url}mlnx_ofed/RPM-GPG-KEY-Mellanox" + rpm --import "$mellanox_key_url" + repo_url="$mellanox_proxy$MLNX_VER_NUM/rhel$gversion/x86_64/" dnf -y config-manager --add-repo="$repo_url" - curl -L -O "$repo_url"RPM-GPG-KEY-Mellanox dnf -y config-manager --save --setopt="$(url_to_repo "$repo_url")".gpgcheck=1 - rpm --import RPM-GPG-KEY-Mellanox - rm -f RPM-GPG-KEY-Mellanox dnf repolist || true time dnf -y install mlnx-ofed-basic ucx-cma ucx-ib ucx-knem ucx-rdmacm ucx-xpmem diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index 4676cabaf2c..5bd02cd386c 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -279,6 +279,7 @@ struct bio_dev_health { void *bdh_intel_smart_buf; /*Intel SMART attributes*/ uint64_t bdh_stat_age; unsigned int bdh_inflights; + unsigned int bdh_stopping:1; uint16_t bdh_vendor_id; /* PCI vendor ID */ /** diff --git a/src/bio/bio_monitor.c b/src/bio/bio_monitor.c index 568d3d89ba2..7a6f94857d5 100644 --- a/src/bio/bio_monitor.c +++ b/src/bio/bio_monitor.c @@ -233,33 +233,19 @@ bio_dev_set_faulty(struct bio_xs_context *xs, uuid_t dev_uuid) return rc; } -static inline struct bio_dev_health * -cb_arg2dev_health(void *cb_arg) -{ - - struct bio_xs_blobstore **bxb_ptr = (struct bio_xs_blobstore **)cb_arg; - struct bio_xs_blobstore *bxb; - - bxb = *bxb_ptr; - /* bio_xsctxt_free() is underway */ - if (bxb == NULL) - return NULL; - - return &bxb->bxb_blobstore->bb_dev_health; -} - static void get_spdk_err_log_page_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) { - struct bio_dev_health *dev_health = cb_arg2dev_health(cb_arg); + struct bio_dev_health *dev_health = cb_arg; int sc, sct; uint32_t cdw0; - if (dev_health == NULL) - goto out; - D_ASSERT(dev_health->bdh_inflights == 1); + if (dev_health->bdh_stopping) { + dev_health->bdh_inflights--; + goto out; + } /* Additional NVMe status information */ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); @@ 
-277,7 +263,7 @@ static void get_spdk_identify_ctrlr_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) { - struct bio_dev_health *dev_health = cb_arg2dev_health(cb_arg); + struct bio_dev_health *dev_health = cb_arg; struct spdk_nvme_ctrlr_data *cdata; struct spdk_bdev *bdev; struct spdk_nvme_cmd cmd; @@ -288,10 +274,11 @@ get_spdk_identify_ctrlr_completion(struct spdk_bdev_io *bdev_io, bool success, int sc, sct; uint32_t cdw0; - if (dev_health == NULL) - goto out; - D_ASSERT(dev_health->bdh_inflights == 1); + if (dev_health->bdh_stopping) { + dev_health->bdh_inflights--; + goto out; + } /* Additional NVMe status information */ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); @@ -563,17 +550,18 @@ static void get_spdk_intel_smart_log_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) { - struct bio_dev_health *dev_health = cb_arg2dev_health(cb_arg); + struct bio_dev_health *dev_health = cb_arg; struct spdk_bdev *bdev; struct spdk_nvme_cmd cmd; uint32_t cp_sz; int rc, sc, sct; uint32_t cdw0; - if (dev_health == NULL) - goto out; - D_ASSERT(dev_health->bdh_inflights == 1); + if (dev_health->bdh_stopping) { + dev_health->bdh_inflights--; + goto out; + } /* Additional NVMe status information */ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); @@ -622,7 +610,7 @@ static void get_spdk_health_info_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) { - struct bio_dev_health *dev_health = cb_arg2dev_health(cb_arg); + struct bio_dev_health *dev_health = cb_arg; struct spdk_bdev *bdev; struct spdk_nvme_cmd cmd; uint32_t page_sz; @@ -630,10 +618,11 @@ get_spdk_health_info_completion(struct spdk_bdev_io *bdev_io, bool success, int rc, sc, sct; uint32_t cdw0; - if (dev_health == NULL) - goto out; - D_ASSERT(dev_health->bdh_inflights == 1); + if (dev_health->bdh_stopping) { + dev_health->bdh_inflights--; + goto out; + } /* Additional NVMe status information */ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); @@ -770,7 +759,7 @@ auto_faulty_detect(struct bio_blobstore *bbs) static void collect_raw_health_data(void *cb_arg) { - struct bio_dev_health *dev_health = cb_arg2dev_health(cb_arg); + struct bio_dev_health *dev_health = cb_arg; struct spdk_bdev *bdev; struct spdk_nvme_cmd cmd; uint32_t numd, numdl, numdu; @@ -863,8 +852,8 @@ bio_bs_monitor(struct bio_xs_context *xs_ctxt, enum smd_dev_type st, uint64_t no DL_ERROR(rc, "State transition on target %d failed", bbs->bb_owner_xs->bxc_tgt_id); - if (!bypass_health_collect()) - collect_raw_health_data((void *)&xs_ctxt->bxc_xs_blobstores[st]); + if (!bypass_health_collect() && !dev_health->bdh_stopping) + collect_raw_health_data((void *)dev_health); } /* Free all device health monitoring info */ @@ -876,6 +865,7 @@ bio_fini_health_monitoring(struct bio_xs_context *ctxt, struct bio_blobstore *bb /* Drain the in-flight request before putting I/O channel */ D_ASSERT(bdh->bdh_inflights < 2); + bdh->bdh_stopping = 1; if (bdh->bdh_inflights > 0) { D_INFO("Wait for health collecting done...\n"); rc = xs_poll_completion(ctxt, &bdh->bdh_inflights, 0); diff --git a/src/bio/bio_xstream.c b/src/bio/bio_xstream.c index bcdedd3b274..1b57241a844 100644 --- a/src/bio/bio_xstream.c +++ b/src/bio/bio_xstream.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -196,8 +196,8 @@ set_faulty_criteria(void) glb_criteria.fc_max_csum_errs = UINT32_MAX; d_getenv_bool("DAOS_NVME_AUTO_FAULTY_ENABLED", &glb_criteria.fc_enabled); - d_getenv_int("DAOS_NVME_AUTO_FAULTY_IO", &glb_criteria.fc_max_io_errs); - d_getenv_int("DAOS_NVME_AUTO_FAULTY_CSUM", &glb_criteria.fc_max_csum_errs); + d_getenv_uint32_t("DAOS_NVME_AUTO_FAULTY_IO", &glb_criteria.fc_max_io_errs); + d_getenv_uint32_t("DAOS_NVME_AUTO_FAULTY_CSUM", &glb_criteria.fc_max_csum_errs); D_INFO("NVMe auto faulty is %s. Criteria: max_io_errs:%u, max_csum_errs:%u\n", glb_criteria.fc_enabled ? "enabled" : "disabled", @@ -249,15 +249,15 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size, d_getenv_bool("DAOS_SCM_RDMA_ENABLED", &bio_scm_rdma); D_INFO("RDMA to SCM is %s\n", bio_scm_rdma ? "enabled" : "disabled"); - d_getenv_int("DAOS_SPDK_SUBSYS_TIMEOUT", &bio_spdk_subsys_timeout); + d_getenv_uint("DAOS_SPDK_SUBSYS_TIMEOUT", &bio_spdk_subsys_timeout); D_INFO("SPDK subsystem fini timeout is %u ms\n", bio_spdk_subsys_timeout); - d_getenv_int("DAOS_SPDK_MAX_UNMAP_CNT", &bio_spdk_max_unmap_cnt); + d_getenv_uint("DAOS_SPDK_MAX_UNMAP_CNT", &bio_spdk_max_unmap_cnt); if (bio_spdk_max_unmap_cnt == 0) bio_spdk_max_unmap_cnt = UINT32_MAX; D_INFO("SPDK batch blob unmap call count is %u\n", bio_spdk_max_unmap_cnt); - d_getenv_int("DAOS_MAX_ASYNC_SZ", &bio_max_async_sz); + d_getenv_uint("DAOS_MAX_ASYNC_SZ", &bio_max_async_sz); D_INFO("Max async data size is set to %u bytes\n", bio_max_async_sz); /* Hugepages disabled */ @@ -1421,20 +1421,6 @@ init_xs_blobstore_ctxt(struct bio_xs_context *ctxt, int tgt_id, enum smd_dev_typ return 0; } -static void -bio_blobstore_free(struct bio_xs_blobstore *bxb, struct bio_xs_context *ctxt) -{ - - struct bio_blobstore *bbs = bxb->bxb_blobstore; - - if (bbs == NULL) - return; - - put_bio_blobstore(bxb, ctxt); - if (is_bbs_owner(ctxt, bbs)) - bio_fini_health_monitoring(ctxt, bbs); -} - /* * Finalize per-xstream NVMe context and SPDK env. * @@ -1463,14 +1449,14 @@ bio_xsctxt_free(struct bio_xs_context *ctxt) bxb->bxb_io_channel = NULL; } - /* - * Clear bxc_xs_blobstore[st] before bio_blobstore_free() to prevent the health - * monitor from issuing health data collecting request, see cb_arg2dev_health(). - */ ctxt->bxc_xs_blobstores[st] = NULL; if (bxb->bxb_blobstore != NULL) { - bio_blobstore_free(bxb, ctxt); + put_bio_blobstore(bxb, ctxt); + + if (is_bbs_owner(ctxt, bxb->bxb_blobstore)) + bio_fini_health_monitoring(ctxt, bxb->bxb_blobstore); + bxb->bxb_blobstore = NULL; } D_FREE(bxb); diff --git a/src/cart/README.env b/src/cart/README.env index 58df8cf2baf..3d4a12963df 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -139,6 +139,12 @@ This file lists the environment variables used in CaRT. If its value exceeds 256, then 256 will be used for flow control. Setting it to zero disables the flow control in cart. + . D_QUOTA_RPCS + Sets the maximum number of per-context inflight RPCs that a sender will put + on the wire. The quota on each context is independent of the others. + If it is not set, the default value of 64 is used. + Setting it to 0 disables the quota. + .
CRT_CTX_SHARE_ADDR Set it to non-zero to make all the contexts share one network address, in this case CaRT will create one SEP and each context maps to one tx/rx diff --git a/src/cart/crt_context.c b/src/cart/crt_context.c index fdcd73c8ec4..78702e25b38 100644 --- a/src/cart/crt_context.c +++ b/src/cart/crt_context.c @@ -11,6 +11,11 @@ #include "crt_internal.h" static void crt_epi_destroy(struct crt_ep_inflight *epi); +static int context_quotas_init(crt_context_t crt_ctx); +static int context_quotas_finalize(crt_context_t crt_ctx); + +static inline int get_quota_resource(crt_context_t crt_ctx, crt_quota_type_t quota); +static inline void put_quota_resource(crt_context_t crt_ctx, crt_quota_type_t quota); static struct crt_ep_inflight * epi_link2ptr(d_list_t *rlink) @@ -141,6 +146,13 @@ crt_context_init(crt_context_t crt_ctx) if (rc != 0) D_GOTO(out, rc); + rc = D_MUTEX_INIT(&ctx->cc_quotas.mutex, NULL); + if (rc != 0) { + D_MUTEX_DESTROY(&ctx->cc_mutex); + D_GOTO(out, rc); + } + + D_INIT_LIST_HEAD(&ctx->cc_quotas.rpc_waitq); D_INIT_LIST_HEAD(&ctx->cc_link); /* create timeout binheap */ @@ -162,6 +174,8 @@ crt_context_init(crt_context_t crt_ctx) D_GOTO(out_binheap_destroy, rc); } + rc = context_quotas_init(crt_ctx); + D_GOTO(out, rc); out_binheap_destroy: @@ -684,10 +698,17 @@ crt_context_destroy(crt_context_t crt_ctx, int force) D_GOTO(out, rc = -DER_UNINIT); } + rc = context_quotas_finalize(crt_ctx); + if (rc) { + DL_ERROR(rc, "context_quotas_finalize() failed"); + if (!force) + D_GOTO(out, rc); + } + ctx = crt_ctx; rc = crt_grp_ctx_invalid(ctx, false /* locked */); if (rc) { - D_ERROR("crt_grp_ctx_invalid failed, rc: %d.\n", rc); + DL_ERROR(rc, "crt_grp_ctx_invalid() failed"); if (!force) D_GOTO(out, rc); } @@ -1167,6 +1188,7 @@ crt_context_req_track(struct crt_rpc_priv *rpc_priv) d_list_t *rlink; d_rank_t ep_rank; int rc = 0; + int quota_rc = 0; struct crt_grp_priv *grp_priv; D_ASSERT(crt_ctx != NULL); @@ -1177,6 +1199,9 @@ crt_context_req_track(struct crt_rpc_priv *rpc_priv) D_GOTO(out, rc = CRT_REQ_TRACK_IN_INFLIGHQ); } + /* check inflight quota. 
if exceeded, queue this rpc */ + quota_rc = get_quota_resource(rpc_priv->crp_pub.cr_ctx, CRT_QUOTA_RPCS); + grp_priv = crt_grp_pub2priv(rpc_priv->crp_pub.cr_ep.ep_grp); ep_rank = crt_grp_priv_get_primary_rank(grp_priv, rpc_priv->crp_pub.cr_ep.ep_rank); @@ -1228,15 +1253,16 @@ crt_context_req_track(struct crt_rpc_priv *rpc_priv) rpc_priv->crp_epi = epi; RPC_ADDREF(rpc_priv); - if (crt_gdata.cg_credit_ep_ctx != 0 && + if (quota_rc == -DER_QUOTA_LIMIT) { + epi->epi_req_num++; + rpc_priv->crp_state = RPC_STATE_QUEUED; + rc = CRT_REQ_TRACK_IN_WAITQ; + } else if (crt_gdata.cg_credit_ep_ctx != 0 && (epi->epi_req_num - epi->epi_reply_num) >= crt_gdata.cg_credit_ep_ctx) { - if (rpc_priv->crp_opc_info->coi_queue_front) { - d_list_add(&rpc_priv->crp_epi_link, - &epi->epi_req_waitq); - } else { - d_list_add_tail(&rpc_priv->crp_epi_link, - &epi->epi_req_waitq); - } + if (rpc_priv->crp_opc_info->coi_queue_front) + d_list_add(&rpc_priv->crp_epi_link, &epi->epi_req_waitq); + else + d_list_add_tail(&rpc_priv->crp_epi_link, &epi->epi_req_waitq); epi->epi_req_wait_num++; rpc_priv->crp_state = RPC_STATE_QUEUED; @@ -1246,13 +1272,11 @@ crt_context_req_track(struct crt_rpc_priv *rpc_priv) rc = crt_req_timeout_track(rpc_priv); D_MUTEX_UNLOCK(&crt_ctx->cc_mutex); if (rc == 0) { - d_list_add_tail(&rpc_priv->crp_epi_link, - &epi->epi_req_q); + d_list_add_tail(&rpc_priv->crp_epi_link, &epi->epi_req_q); epi->epi_req_num++; rc = CRT_REQ_TRACK_IN_INFLIGHQ; } else { - RPC_ERROR(rpc_priv, - "crt_req_timeout_track failed, rc: %d.\n", rc); + RPC_ERROR(rpc_priv, "crt_req_timeout_track failed, rc: %d.\n", rc); /* roll back the addref above */ RPC_DECREF(rpc_priv); } @@ -1264,6 +1288,10 @@ crt_context_req_track(struct crt_rpc_priv *rpc_priv) /* reference taken by d_hash_rec_find or "epi->epi_ref = 1" above */ D_MUTEX_LOCK(&crt_ctx->cc_mutex); d_hash_rec_decref(&crt_ctx->cc_epi_table, &epi->epi_link); + + if (quota_rc == -DER_QUOTA_LIMIT) + d_list_add_tail(&rpc_priv->crp_waitq_link, &crt_ctx->cc_quotas.rpc_waitq); + D_MUTEX_UNLOCK(&crt_ctx->cc_mutex); out: @@ -1280,9 +1308,10 @@ credits_available(struct crt_ep_inflight *epi) { int64_t inflight = epi->epi_req_num - epi->epi_reply_num; - D_ASSERTF(inflight >= 0 && inflight <= crt_gdata.cg_credit_ep_ctx, - "req_num=%ld reply_num=%ld credit_ep_ctx=%u\n", epi->epi_req_num, - epi->epi_reply_num, crt_gdata.cg_credit_ep_ctx); + /* TODO: inflight right now includes items queued in quota waitq, and can exceed credit limit */ + if (inflight > crt_gdata.cg_credit_ep_ctx) + return 0; + return crt_gdata.cg_credit_ep_ctx - inflight; } @@ -1324,6 +1353,7 @@ crt_context_req_untrack_internal(struct crt_rpc_priv *rpc_priv) } else {/* RPC_CANCELED or RPC_INITED or RPC_TIMEOUT */ epi->epi_req_num--; } + D_ASSERT(epi->epi_req_num >= epi->epi_reply_num); D_MUTEX_UNLOCK(&epi->epi_mutex); @@ -1340,6 +1370,27 @@ crt_context_req_untrack_internal(struct crt_rpc_priv *rpc_priv) RPC_DECREF(rpc_priv); } +static void +dispatch_rpc(struct crt_rpc_priv *rpc) { + int rc; + + D_ASSERTF(rpc != NULL, "rpc is NULL\n"); + + crt_rpc_lock(rpc); + + rc = crt_req_send_internal(rpc); + if (rc == 0) { + crt_rpc_unlock(rpc); + } else { + RPC_ADDREF(rpc); + RPC_ERROR(rpc, "crt_req_send_internal failed, rc: %d\n", rc); + rpc->crp_state = RPC_STATE_INITED; + crt_context_req_untrack_internal(rpc); + /* for error case here */ + crt_rpc_complete_and_unlock(rpc, rc); + } +} + void crt_context_req_untrack(struct crt_rpc_priv *rpc_priv) { @@ -1351,17 +1402,26 @@ crt_context_req_untrack(struct crt_rpc_priv *rpc_priv) 
D_ASSERT(crt_ctx != NULL); - if (rpc_priv->crp_pub.cr_opc == CRT_OPC_URI_LOOKUP) { - RPC_TRACE(DB_NET, rpc_priv, "bypass untracking for URI_LOOKUP.\n"); + if (rpc_priv->crp_pub.cr_opc == CRT_OPC_URI_LOOKUP) return; - } epi = rpc_priv->crp_epi; D_ASSERT(epi != NULL); + /* Dispatch one rpc from wait_q if any or return resource back */ + D_MUTEX_LOCK(&crt_ctx->cc_mutex); + tmp_rpc = d_list_pop_entry(&crt_ctx->cc_quotas.rpc_waitq, + struct crt_rpc_priv, crp_waitq_link); + D_MUTEX_UNLOCK(&crt_ctx->cc_mutex); + + if (tmp_rpc != NULL) + dispatch_rpc(tmp_rpc); + else + put_quota_resource(rpc_priv->crp_pub.cr_ctx, CRT_QUOTA_RPCS); + crt_context_req_untrack_internal(rpc_priv); - /* done if flow control disabled */ + /* done if ep credit flow control is disabled */ if (crt_gdata.cg_credit_ep_ctx == 0) return; @@ -1408,20 +1468,8 @@ crt_context_req_untrack(struct crt_rpc_priv *rpc_priv) D_MUTEX_UNLOCK(&epi->epi_mutex); /* re-submit the rpc req */ - while ((tmp_rpc = d_list_pop_entry(&submit_list, struct crt_rpc_priv, crp_tmp_link))) { - crt_rpc_lock(tmp_rpc); - rc = crt_req_send_internal(tmp_rpc); - if (rc == 0) { - crt_rpc_unlock(tmp_rpc); - } else { - RPC_ADDREF(tmp_rpc); - RPC_ERROR(tmp_rpc, "crt_req_send_internal failed, rc: %d\n", rc); - tmp_rpc->crp_state = RPC_STATE_INITED; - crt_context_req_untrack_internal(tmp_rpc); - /* for error case here */ - crt_rpc_complete_and_unlock(tmp_rpc, rc); - } - } + while ((tmp_rpc = d_list_pop_entry(&submit_list, struct crt_rpc_priv, crp_tmp_link))) + dispatch_rpc(tmp_rpc); } /* TODO: Need per-provider call */ @@ -1910,3 +1958,134 @@ crt_req_force_completion(struct crt_rpc_priv *rpc_priv) crt_req_timeout_track(rpc_priv); D_MUTEX_UNLOCK(&crt_ctx->cc_mutex); } + +static int +context_quotas_init(crt_context_t crt_ctx) +{ + struct crt_context *ctx = crt_ctx; + struct crt_quotas *quotas; + int rc = 0; + + if (ctx == NULL) { + D_ERROR("NULL context\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + quotas = &ctx->cc_quotas; + + quotas->limit[CRT_QUOTA_RPCS] = crt_gdata.cg_rpc_quota; + quotas->current[CRT_QUOTA_RPCS] = 0; + quotas->enabled[CRT_QUOTA_RPCS] = crt_gdata.cg_rpc_quota > 0 ? 
true : false; +out: + return rc; +} + +static int +context_quotas_finalize(crt_context_t crt_ctx) +{ + struct crt_context *ctx = crt_ctx; + + if (ctx == NULL) { + D_ERROR("NULL context\n"); + return -DER_INVAL; + } + + for (int i = 0; i < CRT_QUOTA_COUNT; i++) + ctx->cc_quotas.enabled[i] = false; + + return DER_SUCCESS; +} + +int +crt_context_quota_limit_set(crt_context_t crt_ctx, crt_quota_type_t quota, int value) +{ + struct crt_context *ctx = crt_ctx; + int rc = 0; + + if (ctx == NULL) { + D_ERROR("NULL context\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + if (quota < 0 || quota >= CRT_QUOTA_COUNT) { + D_ERROR("Invalid quota %d passed\n", quota); + D_GOTO(out, rc = -DER_INVAL); + } + + D_MUTEX_LOCK(&ctx->cc_quotas.mutex); + ctx->cc_quotas.limit[quota] = value; + D_MUTEX_UNLOCK(&ctx->cc_quotas.mutex); + +out: + return rc; +} + +int +crt_context_quota_limit_get(crt_context_t crt_ctx, crt_quota_type_t quota, int *value) +{ + struct crt_context *ctx = crt_ctx; + int rc = 0; + + if (ctx == NULL) { + D_ERROR("NULL context\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + if (quota < 0 || quota >= CRT_QUOTA_COUNT) { + D_ERROR("Invalid quota %d passed\n", quota); + D_GOTO(out, rc = -DER_INVAL); + } + + if (value == NULL) { + D_ERROR("NULL value\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + *value = ctx->cc_quotas.limit[quota]; + +out: + return rc; +} + +static inline int +get_quota_resource(crt_context_t crt_ctx, crt_quota_type_t quota) +{ + struct crt_context *ctx = crt_ctx; + int rc = 0; + + D_ASSERTF(ctx != NULL, "NULL context\n"); + D_ASSERTF(quota >= 0 && quota < CRT_QUOTA_COUNT, "Invalid quota\n"); + + /* If quotas not enabled or unlimited quota */ + if (!ctx->cc_quotas.enabled[quota] || ctx->cc_quotas.limit[quota] == 0) + return 0; + + /* It's ok if we go slightly above quota in a corner case, but avoid locks */ + if (ctx->cc_quotas.current[quota] < ctx->cc_quotas.limit[quota]) { + atomic_fetch_add(&ctx->cc_quotas.current[quota], 1); + } else { + D_DEBUG(DB_TRACE, "Quota limit (%d) reached for quota_type=%d\n", + ctx->cc_quotas.limit[quota], quota); + rc = -DER_QUOTA_LIMIT; + } + + return rc; +} + +static inline void +put_quota_resource(crt_context_t crt_ctx, crt_quota_type_t quota) +{ + struct crt_context *ctx = crt_ctx; + + D_ASSERTF(ctx != NULL, "NULL context\n"); + D_ASSERTF(quota >= 0 && quota < CRT_QUOTA_COUNT, "Invalid quota\n"); + + /* If quotas not enabled or unlimited quota */ + if (!ctx->cc_quotas.enabled[quota] || ctx->cc_quotas.limit[quota] == 0) + return; + + D_ASSERTF(ctx->cc_quotas.current[quota] > 0, "Invalid current limit"); + atomic_fetch_sub(&ctx->cc_quotas.current[quota], 1); + + return; +} diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 18573f95e16..6c3805a78c9 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1400,7 +1400,7 @@ crt_hg_req_send_cb(const struct hg_cb_info *hg_cbinfo) void crt_hg_req_send(struct crt_rpc_priv *rpc_priv) { - hg_return_t hg_ret; + hg_return_t hg_ret; D_ASSERT(rpc_priv != NULL); diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index ce580b1fe68..8359afa4903 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -96,6 +96,7 @@ dump_envariables(void) "D_PORT_AUTO_ADJUST", "D_POLL_TIMEOUT", "D_LOG_FILE_APPEND_RANK", + "D_QUOTA_RPCS", "D_POST_INIT", "D_POST_INCR", "DAOS_SIGNAL_REGISTER"}; @@ -191,7 +192,7 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, /* Set max number of contexts. Defaults to the number of cores */ ctx_num = 0; - d_getenv_int("CRT_CTX_NUM", &ctx_num); + d_getenv_uint("CRT_CTX_NUM", &ctx_num); if (opt) max_num_ctx = ctx_num ? ctx_num : max(crt_gdata.cg_num_cores, opt->cio_ctx_max_num); else @@ -220,7 +221,7 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, if (share_addr) { set_sep = true; ctx_num = 0; - d_getenv_int("CRT_CTX_NUM", &ctx_num); + d_getenv_uint("CRT_CTX_NUM", &ctx_num); max_num_ctx = ctx_num; } } @@ -257,6 +258,7 @@ prov_data_init(struct crt_prov_gdata *prov_data, crt_provider_t provider, return DER_SUCCESS; } + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) { @@ -277,15 +279,15 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_rpcid, crt_gdata.cg_num_cores); /* Set context post init / post incr to tune number of pre-posted recvs */ - d_getenv_int("D_POST_INIT", &post_init); + d_getenv_uint32_t("D_POST_INIT", &post_init); crt_gdata.cg_post_init = post_init; - d_getenv_int("D_POST_INCR", &post_incr); + d_getenv_uint32_t("D_POST_INCR", &post_incr); crt_gdata.cg_post_incr = post_incr; is_secondary = 0; /* Apply CART-890 workaround for server side only */ if (server) { - d_getenv_int("CRT_ENABLE_MEM_PIN", &mem_pin_enable); + d_getenv_uint("CRT_ENABLE_MEM_PIN", &mem_pin_enable); if (mem_pin_enable == 1) mem_pin_workaround(); } else { @@ -293,14 +295,14 @@ static int data_init(int server, crt_init_options_t *opt) * Client-side envariable to indicate that the cluster * is running using a secondary provider */ - d_getenv_int("CRT_SECONDARY_PROVIDER", &is_secondary); + d_getenv_uint("CRT_SECONDARY_PROVIDER", &is_secondary); } crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; if (opt && opt->cio_crt_timeout != 0) timeout = opt->cio_crt_timeout; else - d_getenv_int("CRT_TIMEOUT", &timeout); + d_getenv_uint("CRT_TIMEOUT", &timeout); if (timeout == 0 || timeout > 3600) crt_gdata.cg_timeout = CRT_DEFAULT_TIMEOUT_S; @@ -319,9 +321,14 @@ static int data_init(int server, crt_init_options_t *opt) credits = opt->cio_ep_credits; } else { credits = CRT_DEFAULT_CREDITS_PER_EP_CTX; - d_getenv_int("CRT_CREDIT_EP_CTX", &credits); + d_getenv_uint("CRT_CREDIT_EP_CTX", &credits); } + /* Enable quotas by default only on clients */ + crt_gdata.cg_rpc_quota = server ? 
0 : CRT_QUOTA_RPCS_DEFAULT; + + d_getenv_uint("D_QUOTA_RPCS", &crt_gdata.cg_rpc_quota); + /* Must be set on the server when using UCX, will not affect OFI */ d_getenv_char("UCX_IB_FORK_INIT", &ucx_ib_fork_init); if (ucx_ib_fork_init) { @@ -332,13 +339,13 @@ static int data_init(int server, crt_init_options_t *opt) } } if (server) - setenv("UCX_IB_FORK_INIT", "n", 1); + d_setenv("UCX_IB_FORK_INIT", "n", 1); /* This is a workaround for CART-871 if universe size is not set */ - d_getenv_int("FI_UNIVERSE_SIZE", &fi_univ_size); + d_getenv_uint("FI_UNIVERSE_SIZE", &fi_univ_size); if (fi_univ_size == 0) { D_INFO("FI_UNIVERSE_SIZE was not set; setting to 2048\n"); - setenv("FI_UNIVERSE_SIZE", "2048", 1); + d_setenv("FI_UNIVERSE_SIZE", "2048", 1); } if (credits == 0) { @@ -529,19 +536,6 @@ check_grpid(crt_group_id_t grpid) return rc; } -static void -apply_if_not_set(const char *env_name, const char *new_value) -{ - char *old_val; - - old_val = getenv(env_name); - - if (old_val == NULL) { - D_INFO("%s not set, setting to %s\n", env_name, new_value); - setenv(env_name, new_value, true); - } -} - static void prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt) { @@ -562,26 +556,25 @@ prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt) if (prov == CRT_PROV_OFI_VERBS_RXM || prov == CRT_PROV_OFI_TCP_RXM) { /* Use shared receive queues to avoid large mem consumption */ - apply_if_not_set("FI_OFI_RXM_USE_SRX", "1"); + d_setenv("FI_OFI_RXM_USE_SRX", "1", 0); /* Only apply on the server side */ if (prov == CRT_PROV_OFI_TCP_RXM && crt_is_service()) - apply_if_not_set("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd"); - + d_setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 0); } if (prov == CRT_PROV_OFI_CXI) mrc_enable = 1; - d_getenv_int("CRT_MRC_ENABLE", &mrc_enable); + d_getenv_uint("CRT_MRC_ENABLE", &mrc_enable); if (mrc_enable == 0) { D_INFO("Disabling MR CACHE (FI_MR_CACHE_MAX_COUNT=0)\n"); - setenv("FI_MR_CACHE_MAX_COUNT", "0", 1); + d_setenv("FI_MR_CACHE_MAX_COUNT", "0", 1); } /* Use tagged messages for other providers, disable multi-recv */ if (prov != CRT_PROV_OFI_CXI && prov != CRT_PROV_OFI_TCP) - apply_if_not_set("NA_OFI_UNEXPECTED_TAG_MSG", "1"); + d_setenv("NA_OFI_UNEXPECTED_TAG_MSG", "1", 0); g_prov_settings_applied[prov] = true; } diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index e9faa7607cd..2e71e28b693 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -88,7 +88,7 @@ struct crt_gdata { /** Provider specific data */ struct crt_prov_gdata cg_prov_gdata_primary; - /** */ + /** Placeholder for secondary provider data */ struct crt_prov_gdata *cg_prov_gdata_secondary; /** Hints to mercury for request post init (ignored for clients) */ @@ -110,6 +110,7 @@ struct crt_gdata { /** HG level global data */ struct crt_hg_gdata *cg_hg; + /** Points to default group */ struct crt_grp_gdata *cg_grp; /** refcount to protect crt_init/crt_finalize */ @@ -145,6 +146,8 @@ struct crt_gdata { struct d_tm_node_t *cg_uri_other; /** Number of cores on a system */ long cg_num_cores; + /** Inflight rpc quota limit */ + uint32_t cg_rpc_quota; }; extern struct crt_gdata crt_gdata; @@ -189,6 +192,14 @@ extern struct crt_plugin_gdata crt_plugin_gdata; #define CRT_DEFAULT_CREDITS_PER_EP_CTX (32) #define CRT_MAX_CREDITS_PER_EP_CTX (256) +struct crt_quotas { + int limit[CRT_QUOTA_COUNT]; + ATOMIC uint32_t current[CRT_QUOTA_COUNT]; + bool enabled[CRT_QUOTA_COUNT]; + pthread_mutex_t mutex; + d_list_t rpc_waitq; +}; + /* 
crt_context */ struct crt_context { d_list_t cc_link; /** link to gdata.cg_ctx_list */ @@ -227,6 +238,9 @@ struct crt_context { /** Stores self uri for the current context */ char cc_self_uri[CRT_ADDR_STR_MAX_LEN]; + + /** Stores quotas */ + struct crt_quotas cc_quotas; }; /* in-flight RPC req list, be tracked per endpoint for every crt_context */ diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index 597213c2b89..af3226facd8 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -19,7 +19,7 @@ #define D_LOGFAC DD_FAC(iv) #include "crt_internal.h" -#include "cart/iv.h" +#include <cart/iv.h> #define IV_DBG(key, msg, ...) \ D_DEBUG(DB_TRACE, "[key=%p] " msg, (key)->iov_buf, ##__VA_ARGS__) @@ -1695,10 +1695,6 @@ crt_iv_fetch(crt_iv_namespace_t ivns, uint32_t class_id, /* The fetch info is contained on current server. */ if (rc == 0) { - /* Finish up the completion call back */ - iv_ops->ivo_on_refresh(ivns_internal, iv_key, 0, - iv_value, false, 0x0, user_priv); - fetch_comp_cb(ivns_internal, class_id, iv_key, NULL, iv_value, rc, cb_arg); @@ -1710,9 +1706,6 @@ return rc; } else if (rc != -DER_IVCB_FORWARD) { /* We got error, call the callback and exit */ - iv_ops->ivo_on_refresh(ivns_internal, iv_key, 0, - NULL, false, rc, user_priv); - fetch_comp_cb(ivns_internal, class_id, iv_key, NULL, NULL, rc, cb_arg); diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 5b680c4b6e8..62efb48e8e3 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -264,7 +264,8 @@ crt_opc_decode(crt_opcode_t crt_opc, char **module_name, char **opc_name) /* Redefining X macro allows to reuse existing lists */ #define X(a, ...) \ case a: \ - opc = #a; + opc = #a; \ + break; /* Next find the opcode name if available for the module */ if (cart_module) { @@ -656,9 +657,9 @@ int crt_req_create(crt_context_t crt_ctx, crt_endpoint_t *tgt_ep, crt_opcode_t opc, crt_rpc_t **req) { - int rc = 0; - struct crt_grp_priv *grp_priv = NULL; + struct crt_grp_priv *grp_priv = NULL; struct crt_rpc_priv *rpc_priv; + int rc = 0; if (crt_ctx == CRT_CONTEXT_NULL || req == NULL) { D_ERROR("invalid parameter (NULL crt_ctx or req).\n"); diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index a1d466c6c67..8f56e5e0c81 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -12,12 +12,14 @@ #define __CRT_RPC_H__ #include -#include "gurt/common.h" +#include <gurt/common.h> /* default RPC timeout 60 seconds */ #define CRT_DEFAULT_TIMEOUT_S (60) /* second */ #define CRT_DEFAULT_TIMEOUT_US (CRT_DEFAULT_TIMEOUT_S * 1e6) /* micro-second */ +#define CRT_QUOTA_RPCS_DEFAULT 64 + /* uri lookup max retry times */ #define CRT_URI_LOOKUP_RETRY_MAX (8) @@ -130,6 +132,8 @@ struct crt_rpc_priv { d_list_t crp_epi_link; /* tmp_link used in crt_context_req_untrack */ d_list_t crp_tmp_link; + /* link for crt_context::cc_quotas.rpc_waitq */ + d_list_t crp_waitq_link; /* link to parent RPC crp_opc_info->co_child_rpcs/co_replied_rpcs */ d_list_t crp_parent_link; /* binheap node for timeout management, in crt_context::cc_bh_timeout */ diff --git a/src/cart/crt_swim.h b/src/cart/crt_swim.h index ecf04e1a406..7788f2822b0 100644 --- a/src/cart/crt_swim.h +++ b/src/cart/crt_swim.h @@ -10,8 +10,8 @@ #ifndef __CRT_SWIM_H__ #define __CRT_SWIM_H__ -#include "gurt/list.h" -#include "cart/swim.h" +#include <gurt/list.h> +#include <cart/swim.h> #include "swim/swim_internal.h" #define CRT_SWIM_NGLITCHES_TRESHOLD 10 diff --git a/src/cart/swim/swim.c b/src/cart/swim/swim.c index ed130a08c62..bd8c417a445 100644 --- a/src/cart/swim/swim.c +++ b/src/cart/swim/swim.c
@@ -1,6 +1,6 @@ /* * Copyright (c) 2016 UChicago Argonne, LLC - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -25,7 +25,7 @@ swim_prot_period_len_default(void) { unsigned int val = SWIM_PROTOCOL_PERIOD_LEN; - d_getenv_int("SWIM_PROTOCOL_PERIOD_LEN", &val); + d_getenv_uint("SWIM_PROTOCOL_PERIOD_LEN", &val); return val; } @@ -34,7 +34,7 @@ swim_suspect_timeout_default(void) { unsigned int val = SWIM_SUSPECT_TIMEOUT; - d_getenv_int("SWIM_SUSPECT_TIMEOUT", &val); + d_getenv_uint("SWIM_SUSPECT_TIMEOUT", &val); return val; } @@ -43,7 +43,7 @@ swim_ping_timeout_default(void) { unsigned int val = SWIM_PING_TIMEOUT; - d_getenv_int("SWIM_PING_TIMEOUT", &val); + d_getenv_uint("SWIM_PING_TIMEOUT", &val); return val; } diff --git a/src/cart/utils/crt_utils.c b/src/cart/utils/crt_utils.c index c65aa284a08..5e1a7582c90 100644 --- a/src/cart/utils/crt_utils.c +++ b/src/cart/utils/crt_utils.c @@ -434,20 +434,20 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) /* These two are always set */ D_INFO("setenv CRT_PHY_ADDR_STR=%s\n", crt_net_cfg_info.provider); - rc = setenv("CRT_PHY_ADDR_STR", crt_net_cfg_info.provider, 1); + rc = d_setenv("CRT_PHY_ADDR_STR", crt_net_cfg_info.provider, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); sprintf(buf, "%d", crt_net_cfg_info.crt_ctx_share_addr); D_INFO("setenv CRT_CTX_SHARE_ADDR=%d\n", crt_net_cfg_info.crt_ctx_share_addr); - rc = setenv("CRT_CTX_SHARE_ADDR", buf, 1); + rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); /* If the server has set this, the client must use the same value. */ if (crt_net_cfg_info.srv_srx_set != -1) { sprintf(buf, "%d", crt_net_cfg_info.srv_srx_set); - rc = setenv("FI_OFI_RXM_USE_SRX", buf, 1); + rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1); D_INFO("setenv FI_OFI_RXM_USE_SRX=%d\n", crt_net_cfg_info.srv_srx_set); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -467,7 +467,7 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) crt_timeout = getenv("CRT_TIMEOUT"); if (!crt_timeout) { sprintf(buf, "%d", crt_net_cfg_info.crt_timeout); - rc = setenv("CRT_TIMEOUT", buf, 1); + rc = d_setenv("CRT_TIMEOUT", buf, 1); D_INFO("setenv CRT_TIMEOUT=%d\n", crt_net_cfg_info.crt_timeout); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -477,7 +477,7 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) ofi_interface = getenv("OFI_INTERFACE"); if (!ofi_interface) { - rc = setenv("OFI_INTERFACE", crt_net_cfg_info.interface, 1); + rc = d_setenv("OFI_INTERFACE", crt_net_cfg_info.interface, 1); D_INFO("Setting OFI_INTERFACE=%s\n", crt_net_cfg_info.interface); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -489,7 +489,7 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) ofi_domain = getenv("OFI_DOMAIN"); if (!ofi_domain) { - rc = setenv("OFI_DOMAIN", crt_net_cfg_info.domain, 1); + rc = d_setenv("OFI_DOMAIN", crt_net_cfg_info.domain, 1); D_INFO("Setting OFI_DOMAIN=%s\n", crt_net_cfg_info.domain); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); diff --git a/src/client/api/event.c b/src/client/api/event.c index 2ab910ac7bf..fa31cfd5ac4 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -97,7 +97,7 @@ daos_eq_lib_init() eq_ref = 1; - d_getenv_int("D_POLL_TIMEOUT", &ev_prog_timeout); + d_getenv_uint32_t("D_POLL_TIMEOUT", &ev_prog_timeout); unlock: D_MUTEX_UNLOCK(&daos_eq_lock); diff --git a/src/client/api/tests/eq_tests.c b/src/client/api/tests/eq_tests.c index 36939544403..fc7daf257fc 100644 --- a/src/client/api/tests/eq_tests.c +++ b/src/client/api/tests/eq_tests.c @@ -1242,8 +1242,8 @@ eq_ut_setup(void **state) { int rc; - setenv("OFI_INTERFACE", "lo", 1); - setenv("D_PROVIDER", "ofi+tcp", 1); + d_setenv("OFI_INTERFACE", "lo", 1); + d_setenv("D_PROVIDER", "ofi+tcp", 1); rc = daos_debug_init(DAOS_LOG_DEFAULT); if (rc != 0) { diff --git a/src/client/dfs/dfs.c b/src/client/dfs/dfs.c index 22182c59f45..a95acd9d31b 100644 --- a/src/client/dfs/dfs.c +++ b/src/client/dfs/dfs.c @@ -18,8 +18,8 @@ #include #include -#include "daos.h" -#include "daos_fs.h" +#include <daos.h> +#include <daos_fs.h> #include "dfs_internal.h" /** D-key name of SB metadata */ diff --git a/src/client/dfs/dfs_sys.c b/src/client/dfs/dfs_sys.c index 72df7fe134e..13bf3c05999 100644 --- a/src/client/dfs/dfs_sys.c +++ b/src/client/dfs/dfs_sys.c @@ -14,10 +14,10 @@ #include -#include "daos.h" -#include "daos_fs.h" +#include <daos.h> +#include <daos_fs.h> -#include "daos_fs_sys.h" +#include <daos_fs_sys.h> /** Number of entries for readdir */ #define DFS_SYS_NUM_DIRENTS 24 diff --git a/src/client/dfs/duns.c b/src/client/dfs/duns.c index 6c0e089bd58..fadd2f1f769 100644 --- a/src/client/dfs/duns.c +++ b/src/client/dfs/duns.c @@ -26,11 +26,11 @@ #endif #include #include -#include "dfuse_ioctl.h" -#include "daos_types.h" -#include "daos.h" -#include "daos_fs.h" -#include "daos_uns.h" +#include <dfuse_ioctl.h> +#include <daos_types.h> +#include <daos.h> +#include <daos_fs.h> +#include <daos_uns.h> #ifndef FUSE_SUPER_MAGIC #define FUSE_SUPER_MAGIC 0x65735546 diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 93f3b414c62..401aee63d4b 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -17,8 +17,8 @@ #include #include -#include "daos.h" -#include "daos_fs.h" +#include <daos.h> +#include <daos_fs.h> #include "dfs_internal.h" diff --git a/src/client/dfuse/dfuse_cont.c b/src/client/dfuse/dfuse_cont.c index bb34bfecc4d..888c6ef52f1 100644 --- a/src/client/dfuse/dfuse_cont.c +++ b/src/client/dfuse/dfuse_cont.c @@ -6,8 +6,8 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_fs.h" -#include "daos_api.h" +#include <daos_fs.h> +#include <daos_api.h> /* Lookup a container within a pool */ void diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c index 84b347a077f..1ef48600a6f 100644 --- a/src/client/dfuse/dfuse_main.c +++ b/src/client/dfuse/dfuse_main.c @@ -19,9 +19,9 @@ #include "dfuse.h" -#include "daos_fs.h" -#include "daos_api.h" -#include "daos_uns.h" +#include <daos_fs.h> +#include <daos_api.h> +#include <daos_uns.h> #include /* Signal handler for SIGCHLD, it doesn't need to do anything, but it's diff --git a/src/client/dfuse/dfuse_pool.c b/src/client/dfuse/dfuse_pool.c index d0172b98ed8..745d2c9b7bf 100644 --- a/src/client/dfuse/dfuse_pool.c +++ b/src/client/dfuse/dfuse_pool.c @@ -6,9 +6,9 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_fs.h" -#include "daos_api.h" -#include "daos_security.h" +#include <daos_fs.h> +#include <daos_api.h> +#include <daos_security.h> /* Lookup a pool */ void diff --git a/src/client/dfuse/il/int_posix.c b/src/client/dfuse/il/int_posix.c index 464ea687247..d9229bf6f4b 100644 --- a/src/client/dfuse/il/int_posix.c +++ b/src/client/dfuse/il/int_posix.c @@ -21,12 +21,14 @@ #include #include -#include -#include "dfuse_log.h" #include #include + +#include +#include + +#include "dfuse_log.h" #include "intercept.h" -#include "dfuse_ioctl.h" #include "dfuse_vector.h" #include "dfuse_common.h" @@ -2178,7 +2180,7 @@ dfuse_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) if (nread != nmemb) entry->fd_eof = true; } else if (bytes_read < 0) { - entry->fd_err = bytes_read; + entry->fd_err = errcode; } else { entry->fd_eof = true; } @@ -2237,7 +2239,7 @@ dfuse_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) nwrite = bytes_written / size; entry->fd_pos = oldpos + (nwrite * size); } else if (bytes_written < 0) { - entry->fd_err = bytes_written; + entry->fd_err = errcode; } vector_decref(&fd_table, entry); diff --git a/src/client/dfuse/il/int_read.c b/src/client/dfuse/il/int_read.c index 497e39273ab..24f6be3051d 100644 --- a/src/client/dfuse/il/int_read.c +++ b/src/client/dfuse/il/int_read.c @@ -7,8 +7,8 @@ #define D_LOGFAC DD_FAC(il) #include "dfuse_common.h" #include "intercept.h" -#include "daos.h" -#include "daos_array.h" +#include <daos.h> +#include <daos_array.h> #include "ioil.h" diff --git a/src/client/dfuse/il/int_write.c b/src/client/dfuse/il/int_write.c index abbb573638d..2de4b3a4460 100644 --- a/src/client/dfuse/il/int_write.c +++ b/src/client/dfuse/il/int_write.c @@ -5,10 +5,12 @@ */ #define D_LOGFAC DD_FAC(il) + +#include <daos.h> +#include <daos_array.h> + #include "dfuse_common.h" #include "intercept.h" -#include "daos.h" -#include "daos_array.h" #include "ioil.h" diff --git a/src/client/dfuse/il/ioil.h b/src/client/dfuse/il/ioil.h index b9581b3bd77..f765ee66626 100644 --- a/src/client/dfuse/il/ioil.h +++ b/src/client/dfuse/il/ioil.h @@ -11,7 +11,7 @@ #include #include -#include "daos_fs.h" +#include <daos_fs.h> struct ioil_cont { /* Container open handle */ diff --git a/src/client/dfuse/ops/getxattr.c b/src/client/dfuse/ops/getxattr.c index f24b3943e0e..e9e32c0b463 100644 --- a/src/client/dfuse/ops/getxattr.c +++ b/src/client/dfuse/ops/getxattr.c @@ -7,7 +7,7 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_uns.h" +#include <daos_uns.h> static int _dfuse_attr_create(char *type, uuid_t pool, uuid_t cont, char **_value, daos_size_t *_out_size) diff --git a/src/client/dfuse/ops/ioctl.c b/src/client/dfuse/ops/ioctl.c index 5f2b06e34e1..574279027c1 100644 --- a/src/client/dfuse/ops/ioctl.c +++ b/src/client/dfuse/ops/ioctl.c @@ -9,7 +9,7 @@ #include -#include "dfuse_ioctl.h" +#include <dfuse_ioctl.h> #define MAX_IOCTL_SIZE ((1024 * 16) - 1) diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c index 913b987b368..f47e88986cd 100644 --- a/src/client/dfuse/ops/lookup.c +++ b/src/client/dfuse/ops/lookup.c @@ -7,7 +7,7 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_uns.h" +#include <daos_uns.h> char *duns_xattr_name = DUNS_XATTR_NAME; diff --git a/src/client/dfuse/ops/readdir.c b/src/client/dfuse/ops/readdir.c index d8df6657861..91bf815e09e 100644 --- a/src/client/dfuse/ops/readdir.c +++ b/src/client/dfuse/ops/readdir.c @@ -7,7 +7,7 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_uns.h" +#include <daos_uns.h> /* Initial number of dentries to read when doing readdirplus */ #define READDIR_PLUS_COUNT 26 diff --git a/src/client/dfuse/ops/setxattr.c b/src/client/dfuse/ops/setxattr.c index 9de78581f24..e50c010b377 100644 --- a/src/client/dfuse/ops/setxattr.c +++ b/src/client/dfuse/ops/setxattr.c @@ -7,7 +7,7 @@ #include "dfuse_common.h" #include "dfuse.h" -#include "daos_uns.h" +#include <daos_uns.h> #define ACL_ACCESS "system.posix_acl_access" #define ACL_DEFAULT "system.posix_acl_default" diff --git a/src/client/ds3/ds3_internal.h b/src/client/ds3/ds3_internal.h index 1df7b3586de..83ca23f0e75 100644 ---
a/src/client/ds3/ds3_internal.h +++ b/src/client/ds3/ds3_internal.h @@ -10,9 +10,9 @@ #define __DAOS_S3_INTERNAL_H__ #include -#include "daos.h" -#include "daos_fs.h" -#include "daos_s3.h" +#include <daos.h> +#include <daos_fs.h> +#include <daos_s3.h> #include #define METADATA_BUCKET "_METADATA" diff --git a/src/common/mem.c b/src/common/mem.c index bd07ff8ab7e..4047356f987 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -61,7 +61,7 @@ umempobj_settings_init(bool md_on_ssd) return rc; } - d_getenv_int("DAOS_MD_ON_SSD_MODE", &md_mode); + d_getenv_uint("DAOS_MD_ON_SSD_MODE", &md_mode); switch (md_mode) { case DAOS_MD_BMEM: diff --git a/src/common/misc.c b/src/common/misc.c index fa64a785747..f7d6b1ddad0 100644 --- a/src/common/misc.c +++ b/src/common/misc.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -656,7 +656,7 @@ daos_crt_init_opt_get(bool server, int ctx_nr) daos_crt_init_opt.cio_use_sensors = server; /** configure cart for maximum bulk threshold */ - d_getenv_int("DAOS_RPC_SIZE_LIMIT", &limit); + d_getenv_uint32_t("DAOS_RPC_SIZE_LIMIT", &limit); daos_crt_init_opt.cio_use_expected_size = 1; daos_crt_init_opt.cio_max_expected_size = limit ? limit : DAOS_RPC_SIZE; diff --git a/src/container/container_iv.c b/src/container/container_iv.c index e7ae943999c..07c023f3eae 100644 --- a/src/container/container_iv.c +++ b/src/container/container_iv.c @@ -247,6 +247,12 @@ cont_iv_snap_ent_create(struct ds_iv_entry *entry, struct ds_iv_key *key) rc = dbtree_update(root_hdl, &key_iov, &val_iov); if (rc) D_GOTO(out, rc); + + rc = ds_cont_tgt_snapshots_update(entry->ns->iv_pool_uuid, + civ_key->cont_uuid, + snaps, snap_cnt); + if (rc) + D_GOTO(out, rc); out: D_FREE(iv_entry); D_FREE(snaps); @@ -426,6 +432,10 @@ cont_iv_prop_ent_create(struct ds_iv_entry *entry, struct ds_iv_key *key) rc = dbtree_update(root_hdl, &key_iov, &val_iov); if (rc) D_GOTO(out, rc); + + rc = ds_cont_tgt_prop_update(entry->ns->iv_pool_uuid, civ_key->cont_uuid, prop); + if (rc) + D_GOTO(out, rc); out: if (prop != NULL) daos_prop_free(prop); @@ -461,7 +471,7 @@ cont_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key, rc = cont_iv_snap_ent_create(entry, key); if (rc == 0) goto again; - D_ERROR("create cont snap iv entry failed " + D_DEBUG(DB_MD, "create cont snap iv entry failed " ""DF_RC"\n", DP_RC(rc)); } else if (class_id == IV_CONT_PROP) { rc = cont_iv_prop_ent_create(entry, key); @@ -763,8 +773,8 @@ cont_iv_fetch(void *ns, int class_id, uuid_t key_uuid, civ_key->entry_size = entry_size; rc = ds_iv_fetch(ns, &key, cont_iv ? &sgl : NULL, retry); if (rc) - DL_CDEBUG(rc == -DER_NOTLEADER, DB_MGMT, DLOG_ERR, rc, DF_UUID " iv fetch failed", - DP_UUID(key_uuid)); + D_DEBUG(DB_MGMT, DF_UUID " iv fetch failed: %d", + DP_UUID(key_uuid), rc); return rc; } diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 751f54b6191..938970f6232 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -25,8 +25,8 @@ #include "rpc.h" #include "srv_internal.h" #include "srv_layout.h" -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_producer.h" +#include <gurt/telemetry_common.h> +#include <gurt/telemetry_producer.h> #define DAOS_POOL_GLOBAL_VERSION_WITH_CONT_MDTIMES 2 #define DAOS_POOL_GLOBAL_VERSION_WITH_CONT_NHANDLES 2 @@ -1818,6 +1818,7 @@ cont_agg_eph_leader_ult(void *arg) struct ds_pool *pool = svc->cs_pool; struct cont_ec_agg *ec_agg; struct cont_ec_agg *tmp; + uint64_t cur_eph, new_eph; int rc = 0; if (svc->cs_ec_leader_ephs_req == NULL) @@ -1874,11 +1875,18 @@ * server might cause the minimum epoch is less than * ea_current_eph. */ - D_DEBUG(DB_MD, DF_CONT" minimum "DF_U64" current " - DF_U64"\n", - DP_CONT(svc->cs_pool_uuid, - ec_agg->ea_cont_uuid), + D_DEBUG(DB_MD, DF_CONT" minimum "DF_U64" current "DF_U64"\n", + DP_CONT(svc->cs_pool_uuid, ec_agg->ea_cont_uuid), min_eph, ec_agg->ea_current_eph); + + cur_eph = d_hlc2sec(ec_agg->ea_current_eph); + new_eph = d_hlc2sec(min_eph); + if (cur_eph && new_eph > cur_eph && (new_eph - cur_eph) >= 600) + D_WARN(DF_CONT": Sluggish EC boundary reporting. " + "cur:"DF_U64" new:"DF_U64" gap:"DF_U64"\n", + DP_CONT(svc->cs_pool_uuid, ec_agg->ea_cont_uuid), + cur_eph, new_eph, new_eph - cur_eph); + rc = cont_iv_ec_agg_eph_refresh(pool->sp_iv_ns, ec_agg->ea_cont_uuid, min_eph); diff --git a/src/container/srv_target.c b/src/container/srv_target.c index dd6fe8008ad..83b1846bead 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -50,6 +50,7 @@ agg_rate_ctl(void *arg) struct ds_cont_child *cont = param->ap_cont; struct ds_pool *pool = cont->sc_pool->spc_pool; struct sched_request *req = cont2req(cont, param->ap_vos_agg); + uint32_t msecs; /* Abort current round of aggregation */ if (dss_ult_exiting(req) || pool->sp_reclaim == DAOS_RECLAIM_DISABLED) return -1; @@ -62,28 +63,16 @@ if (pool->sp_rebuilding && cont->sc_ec_agg_active && !param->ap_vos_agg) return -1; - /* System is idle, let aggregation run in tight mode */ - if (!dss_xstream_is_busy()) { + /* When system is idle or under space pressure, let aggregation run in tight mode */ + if (!dss_xstream_is_busy() || sched_req_space_check(req) != SCHED_SPACE_PRESS_NONE) { sched_req_yield(req); return 0; } - /* - * When it's under space pressure, aggregation will continue run in slack - * mode no matter what reclaim policy is used, otherwise, it'll take an extra - * sleep to minimize the performance impact. - */ - if (sched_req_space_check(req) == SCHED_SPACE_PRESS_NONE) { - uint32_t msecs; - - /* Sleep 2 seconds in lazy mode, it's kind of pausing aggregation */ - msecs = (pool->sp_reclaim == DAOS_RECLAIM_LAZY) ? 2000 : 50; - sched_req_sleep(req, msecs); - } else { - sched_req_yield(req); - } + msecs = (pool->sp_reclaim == DAOS_RECLAIM_LAZY) ?
1000 : 50; + sched_req_sleep(req, msecs); - /* System is busy, let aggregation run in slack mode */ + /* System is busy and no space pressure, let aggregation run in slack mode */ return 1; } @@ -239,6 +228,14 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, return false; } + /* + * EC aggregation must proceed no matter if the target is busy or not, + * otherwise, the global EC boundary won't be bumped promptly, and that + * will impact VOS aggregation on every target. + */ + if (!vos_agg) + return true; + if (pool->sp_reclaim == DAOS_RECLAIM_LAZY && dss_xstream_is_busy() && sched_req_space_check(req) == SCHED_SPACE_PRESS_NONE) { D_DEBUG(DB_EPC, "Pool reclaim strategy is lazy, service is " @@ -629,13 +626,18 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, rc = ABT_cond_create(&cont->sc_scrub_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_mutex; + goto out_resync_cond; + } + rc = ABT_cond_create(&cont->sc_rebuild_cond); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto out_scrub_cond; } cont->sc_pool = ds_pool_child_lookup(po_uuid); if (cont->sc_pool == NULL) { rc = -DER_NO_HDL; - goto out_cond; + goto out_rebuild_cond; } rc = vos_cont_open(cont->sc_pool->spc_hdl, co_uuid, &cont->sc_hdl); @@ -661,7 +663,11 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, out_pool: ds_pool_child_put(cont->sc_pool); -out_cond: +out_rebuild_cond: + ABT_cond_free(&cont->sc_rebuild_cond); +out_scrub_cond: + ABT_cond_free(&cont->sc_scrub_cond); +out_resync_cond: ABT_cond_free(&cont->sc_dtx_resync_cond); out_mutex: ABT_mutex_free(&cont->sc_mutex); @@ -688,6 +694,7 @@ cont_child_free_ref(struct daos_llink *llink) D_FREE(cont->sc_snapshots); ABT_cond_free(&cont->sc_dtx_resync_cond); ABT_cond_free(&cont->sc_scrub_cond); + ABT_cond_free(&cont->sc_rebuild_cond); ABT_mutex_free(&cont->sc_mutex); D_FREE(cont); } @@ -742,6 +749,12 @@ ds_cont_child_cache_destroy(struct daos_lru_cache *cache) daos_lru_cache_destroy(cache); } +static void +cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) +{ + daos_lru_ref_release(cache, &cont->sc_list); +} + /* * If create == false, then this is assumed to be a pure lookup. In this case, * -DER_NONEXIST is returned if the ds_cont_child object does not exist. @@ -776,12 +789,6 @@ cont_child_lookup(struct daos_lru_cache *cache, const uuid_t co_uuid, return 0; } -static void -cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) -{ - daos_lru_ref_release(cache, &cont->sc_list); -} - static inline bool cont_child_started(struct ds_cont_child *cont_child) { @@ -807,13 +814,13 @@ cont_child_stop(struct ds_cont_child *cont_child) /* Some ds_cont_child will only created by ds_cont_child_lookup(). 
* never be started at all */ + cont_child->sc_stopping = 1; if (cont_child_started(cont_child)) { D_DEBUG(DB_MD, DF_CONT"[%d]: Stopping container\n", DP_CONT(cont_child->sc_pool->spc_uuid, cont_child->sc_uuid), dss_get_module_info()->dmi_tgt_id); - cont_child->sc_stopping = 1; d_list_del_init(&cont_child->sc_link); dtx_cont_deregister(cont_child); @@ -843,6 +850,18 @@ ds_cont_child_stop_all(struct ds_pool_child *pool_child) } } +void +ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child) +{ + struct ds_cont_child *cont_child; + + D_DEBUG(DB_MD, DF_UUID"[%d]: reset all containers EC aggregate epoch.\n", + DP_UUID(pool_child->spc_uuid), dss_get_module_info()->dmi_tgt_id); + + d_list_for_each_entry(cont_child, &pool_child->spc_cont_list, sc_link) + cont_child->sc_ec_agg_eph = cont_child->sc_ec_agg_eph_boundary; +} + static int cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, bool *started, struct ds_cont_child **cont_out) @@ -1154,6 +1173,7 @@ cont_child_destroy_one(void *vin) &cont); if (rc == -DER_NONEXIST) break; + if (rc != 0) D_GOTO(out_pool, rc); @@ -1178,7 +1198,7 @@ cont_child_destroy_one(void *vin) ABT_mutex_unlock(cont->sc_mutex); /* Give chance to DTX reindex ULT for exit. */ - if (unlikely(cont->sc_dtx_reindex)) + while (unlikely(cont->sc_dtx_reindex)) ABT_thread_yield(); /* Make sure checksum scrubbing has stopped */ @@ -1189,6 +1209,12 @@ cont_child_destroy_one(void *vin) } ABT_mutex_unlock(cont->sc_mutex); + /* Make sure rebuild has stopped */ + ABT_mutex_lock(cont->sc_mutex); + if (cont->sc_rebuilding) + ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex); + ABT_mutex_unlock(cont->sc_mutex); + retry_cnt++; if (retry_cnt > 1) { D_ERROR("container is still in-use: open %u, resync %s, reindex %s\n", @@ -1288,9 +1314,20 @@ ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid, struct ds_cont_child **ds_cont) { struct dsm_tls *tls = dsm_tls_get(); + int rc; + + rc = cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid, + true /* create */, ds_cont); + if (rc != 0) + return rc; + + if ((*ds_cont)->sc_stopping) { + cont_child_put(tls->dt_cont_cache, *ds_cont); + *ds_cont = NULL; + return -DER_SHUTDOWN; + } - return cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid, - true /* create */, ds_cont); + return 0; } /** @@ -1625,6 +1662,8 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, struct dss_coll_ops coll_ops = { 0 }; struct dss_coll_args coll_args = { 0 }; struct ds_pool *pool; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; int rc; /* Only for debugging purpose to compare srv_cont_hdl with cont_hdl_uuid */ @@ -1657,18 +1696,22 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, coll_args.ca_func_args = &arg; /* setting aggregator args */ - rc = ds_pool_get_failed_tgt_idx(pool_uuid, &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_failed_tgt_idx(pool_uuid, &exclude_tgts, &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); - return rc; + goto out; } - rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); - D_FREE(coll_args.ca_exclude_tgts); + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap, + &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto out; + } - if (rc != 0) { + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); + if (rc != 0) /* Once it exclude the target from the pool, since the target * might 
still in the cart group, so IV cont open might still * come to this target, especially if cont open/close will be @@ -1678,9 +1721,10 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n", DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid), DP_RC(rc)); - return rc; - } +out: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); return rc; } diff --git a/src/control/cmd/daos/util.h b/src/control/cmd/daos/util.h index 25563e5e1dc..e58fdabea62 100644 --- a/src/control/cmd/daos/util.h +++ b/src/control/cmd/daos/util.h @@ -25,12 +25,12 @@ #include #include -#include "daos_types.h" -#include "daos_api.h" -#include "daos_fs.h" -#include "daos_uns.h" -#include "daos_mgmt.h" -#include "dfuse_ioctl.h" +#include +#include +#include +#include +#include +#include #include "daos_hdlr.h" diff --git a/src/control/cmd/daos_agent/start.go b/src/control/cmd/daos_agent/start.go index 5814c3f5250..791af47ba63 100644 --- a/src/control/cmd/daos_agent/start.go +++ b/src/control/cmd/daos_agent/start.go @@ -48,7 +48,7 @@ type startCmd struct { func (cmd *startCmd) Execute(_ []string) error { if err := common.CheckDupeProcess(); err != nil { - return err + cmd.Notice(err.Error()) } cmd.Infof("Starting %s (pid %d)", versionString(), os.Getpid()) @@ -123,8 +123,7 @@ func (cmd *startCmd) Execute(_ []string) error { drpcSrvStart := time.Now() err = drpcServer.Start(hwlocCtx) if err != nil { - cmd.Errorf("Unable to start socket server on %s: %v", sockPath, err) - return err + return errors.Wrap(err, "unable to start dRPC server") } cmd.Debugf("dRPC socket server started: %s", time.Since(drpcSrvStart)) diff --git a/src/control/cmd/dmg/pretty/storage_nvme.go b/src/control/cmd/dmg/pretty/storage_nvme.go index 11e07f9df13..fefc3eef285 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme.go +++ b/src/control/cmd/dmg/pretty/storage_nvme.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2023 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,6 +7,7 @@ package pretty import ( + "errors" "fmt" "io" "sort" @@ -283,6 +284,9 @@ func PrintNvmeMetaMap(hsm control.HostStorageMap, out io.Writer, opts ...PrintCo } for _, controller := range hss.HostStorage.NvmeDevices { + if controller == nil { + return errors.New("nil controller in NvmeDevices") + } if err := printNvmeControllerSummary(controller, out, opts...); err != nil { return err } @@ -292,6 +296,10 @@ func PrintNvmeMetaMap(hsm control.HostStorageMap, out io.Writer, opts ...PrintCo for _, device := range controller.SmdDevices { iw1 := txtfmt.NewIndentWriter(iw) + + // Attach parent controller details to SMD before printing. + device.Ctrlr = *controller + if err := printSmdDevice(device, iw1, opts...); err != nil { return err } diff --git a/src/control/cmd/dmg/pretty/storage_nvme_test.go b/src/control/cmd/dmg/pretty/storage_nvme_test.go index 60b0f4dcd5c..c75b0061e14 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme_test.go +++ b/src/control/cmd/dmg/pretty/storage_nvme_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2023 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -329,23 +329,30 @@ PCI:%s Model:%s FW:%s Socket:%d Capacity:%s } func TestPretty_PrintNVMetaMap(t *testing.T) { + mockNvmeController := func(idx int32) *storage.NvmeController { + c := storage.MockNvmeController(idx) + c.SmdDevices = []*storage.SmdDevice{ + storage.MockSmdDevice(nil, idx), + } + return c + } var ( - controllerA = storage.MockNvmeController(1) - controllerB = storage.MockNvmeController(2) - controllerC = storage.MockNvmeController(1) - controllerD = storage.MockNvmeController(2) - controllerE = storage.MockNvmeController(1) - controllerF = storage.MockNvmeController(2) + controllerA = mockNvmeController(1) + controllerB = mockNvmeController(2) + controllerC = mockNvmeController(1) + controllerD = mockNvmeController(2) + controllerE = mockNvmeController(1) + controllerF = mockNvmeController(2) ) controllerA.SmdDevices = nil controllerB.SmdDevices = nil controllerE.SmdDevices = []*storage.SmdDevice{ - storage.MockSmdDevice(controllerE.PciAddr, 0), - storage.MockSmdDevice(controllerE.PciAddr, 1), + storage.MockSmdDevice(nil, 0), + storage.MockSmdDevice(nil, 1), } controllerF.SmdDevices = []*storage.SmdDevice{ - storage.MockSmdDevice(controllerF.PciAddr, 2), - storage.MockSmdDevice(controllerF.PciAddr, 3), + storage.MockSmdDevice(nil, 2), + storage.MockSmdDevice(nil, 3), } for name, tc := range map[string]struct { hsm control.HostStorageMap diff --git a/src/control/common/proto/mocks.go b/src/control/common/proto/mocks.go index e56275a2bb0..9aee111bc4c 100644 --- a/src/control/common/proto/mocks.go +++ b/src/control/common/proto/mocks.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2019-2023 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,15 +28,20 @@ func MockNvmeNamespace(varIdx ...int32) *ctlpb.NvmeController_Namespace { // MockSmdDevice is a mock protobuf SmdDevice message used in tests for // multiple packages. -func MockSmdDevice(parentTrAddr string, varIdx ...int32) *ctlpb.SmdDevice { - native := storage.MockSmdDevice(parentTrAddr, varIdx...) +func MockSmdDevice(c *storage.NvmeController, varIdx ...int32) *ctlpb.SmdDevice { + native := storage.MockSmdDevice(c, varIdx...) pb := new(SmdDevice) if err := pb.FromNative(native); err != nil { panic(err) } - return pb.AsProto() + pbSmdDevice := pb.AsProto() + if c == nil { + pbSmdDevice.Ctrlr = nil + } + + return pbSmdDevice } // MockNvmeHealth is a mock protobuf Health message used in tests for diff --git a/src/control/common/proto/types_test.go b/src/control/common/proto/types_test.go index d677eca1160..f1d3bbd201c 100644 --- a/src/control/common/proto/types_test.go +++ b/src/control/common/proto/types_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2023 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -45,16 +45,18 @@ func TestProto_ConvertNvmeHealth(t *testing.T) { } func TestProto_ConvertSmdDevice(t *testing.T) { - pb := MockSmdDevice("0000:80:00.0", 1) + c := storage.MockNvmeController() + pb := MockSmdDevice(c, 1) pb.Ctrlr.HealthStats = MockNvmeHealth(1) native, err := (*SmdDevice)(pb).ToNative() if err != nil { t.Fatal(err) } - expNative := storage.MockSmdDevice("0000:80:00.0", 1) + expNative := storage.MockSmdDevice(c, 1) expNative.Ctrlr.HealthStats = storage.MockNvmeHealth(1) - if diff := cmp.Diff(expNative, native, test.DefaultCmpOpts()...); diff != "" { + co := cmpopts.IgnoreFields(storage.NvmeController{}, "Serial") + if diff := cmp.Diff(expNative, native, co); diff != "" { t.Fatalf("unexpected result (-want, +got):\n%s\n", diff) } } diff --git a/src/control/drpc/drpc_server.go b/src/control/drpc/drpc_server.go index 6627e4b6f47..35f85ef5758 100644 --- a/src/control/drpc/drpc_server.go +++ b/src/control/drpc/drpc_server.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2022 Intel Corporation. +// (C) Copyright 2018-2023 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -87,28 +87,51 @@ func (d *DomainSocketServer) Listen(ctx context.Context) { // Start sets up the dRPC server socket and kicks off the listener goroutine. func (d *DomainSocketServer) Start(ctx context.Context) error { - // Just in case an old socket file is still lying around - if err := syscall.Unlink(d.sockFile); err != nil && !os.IsNotExist(err) { - return errors.Wrapf(err, "Unable to unlink %s", d.sockFile) + if d == nil { + return errors.New("DomainSocketServer is nil") } addr := &net.UnixAddr{Name: d.sockFile, Net: "unixpacket"} + if err := d.checkExistingSocket(ctx, addr); err != nil { + return err + } + lis, err := net.ListenUnix("unixpacket", addr) if err != nil { - return errors.Wrapf(err, "Unable to listen on unix socket %s", d.sockFile) + return errors.Wrapf(err, "unable to listen on unix socket %s", d.sockFile) } d.listener = lis - // The only writer should be the I/O Engines which should be running as the same user as - // daos_server process. if err := os.Chmod(d.sockFile, d.sockFileMode); err != nil { - return errors.Wrapf(err, "Unable to set permissions on %s", d.sockFile) + return errors.Wrapf(err, "unable to set permissions on %s", d.sockFile) } go d.Listen(ctx) return nil } +func (d *DomainSocketServer) checkExistingSocket(ctx context.Context, addr *net.UnixAddr) error { + conn, err := net.DialUnix("unixpacket", nil, addr) + if err == nil { + _ = conn.Close() + return FaultSocketFileInUse(d.sockFile) + } + + if errors.Is(err, syscall.ENOENT) { + return nil + } + + if errors.Is(err, syscall.ECONNREFUSED) { + // File exists but no one is listening - it's safe to delete. + if err := syscall.Unlink(addr.Name); err != nil && !os.IsNotExist(err) { + return errors.Wrap(err, "unlink old socket file") + } + return nil + } + + return err +} + // RegisterRPCModule takes a Module and associates it with the given // DomainSocketServer so it can be used to process incoming dRPC calls. 
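
The dial-before-listen check in checkExistingSocket() above classifies the socket path three ways: a successful connect means a live daemon owns it, ENOENT means there is nothing to clean up, and ECONNREFUSED means an orphaned file that can safely be unlinked. A minimal standalone sketch of the same probe, with hypothetical names and outside the drpc package:

    package main

    import (
    	"errors"
    	"fmt"
    	"net"
    	"os"
    	"syscall"
    )

    // classifySocket reports whether a unix socket path is live, absent or
    // stale, removing the file in the stale case. Sketch only; not the DAOS
    // implementation.
    func classifySocket(path string) (string, error) {
    	conn, err := net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: path, Net: "unixpacket"})
    	if err == nil {
    		_ = conn.Close()
    		return "in-use", nil // something answered; do not delete
    	}
    	if errors.Is(err, syscall.ENOENT) {
    		return "absent", nil // no file; safe to listen
    	}
    	if errors.Is(err, syscall.ECONNREFUSED) {
    		// File exists but nothing is listening; safe to remove.
    		if err := syscall.Unlink(path); err != nil && !os.IsNotExist(err) {
    			return "", err
    		}
    		return "stale-removed", nil
    	}
    	return "", err
    }

    func main() {
    	state, err := classifySocket("/tmp/example.sock")
    	fmt.Println(state, err)
    }
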
func (d *DomainSocketServer) RegisterRPCModule(mod Module) { diff --git a/src/control/drpc/drpc_server_test.go b/src/control/drpc/drpc_server_test.go index a63175f95a9..1713a6e8b43 100644 --- a/src/control/drpc/drpc_server_test.go +++ b/src/control/drpc/drpc_server_test.go @@ -17,6 +17,8 @@ import ( "google.golang.org/protobuf/proto" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" "github.com/daos-stack/daos/src/control/logging" ) @@ -144,52 +146,119 @@ func TestNewDomainSocketServer(t *testing.T) { test.AssertEqual(t, dss.sockFile, expectedSock, "wrong sockfile") } -func TestServer_Start_CantUnlinkSocket(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - - tmpDir, tmpCleanup := test.CreateTestDir(t) - defer tmpCleanup() - - path := filepath.Join(tmpDir, "test.sock") - - // Forbid searching the directory - if err := os.Chmod(tmpDir, 0000); err != nil { - t.Fatalf("Couldn't change permissions on dir: %v", err) +func TestDrpc_DomainSocketServer_Start(t *testing.T) { + sockPath := func(dir string) string { + return filepath.Join(dir, "test.sock") } - defer func() { - _ = os.Chmod(tmpDir, 0700) - }() - - dss, _ := NewDomainSocketServer(log, path, testFileMode) - - err := dss.Start(test.Context(t)) - - test.CmpErr(t, errors.New("unlink"), err) -} -func TestServer_Start_CantListen(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - - tmpDir, tmpCleanup := test.CreateTestDir(t) - defer tmpCleanup() - - path := filepath.Join(tmpDir, "test.sock") - - // Forbid writing the directory - if err := os.Chmod(tmpDir, 0500); err != nil { - t.Fatalf("Couldn't change permissions on dir: %v", err) + for name, tc := range map[string]struct { + nilServer bool + setup func(t *testing.T, dir string) func() + expErr error + }{ + "nil": { + nilServer: true, + expErr: errors.New("nil"), + }, + "unused existing socket file": { + setup: func(t *testing.T, dir string) func() { + t.Helper() + + f, err := os.Create(sockPath(dir)) + if err != nil { + t.Fatal(err) + } + _ = f.Close() + return func() {} + }, + }, + "can't unlink old socket file": { + setup: func(t *testing.T, dir string) func() { + t.Helper() + + sockFile := sockPath(dir) + f, err := os.Create(sockFile) + if err != nil { + t.Fatal(err) + } + _ = f.Close() + + if err := os.Chmod(dir, 0500); err != nil { + t.Fatalf("Couldn't change permissions on dir: %v", err) + } + return func() { + _ = os.Chmod(dir, 0700) + } + }, + expErr: errors.New("unlink"), + }, + "socket file in use": { + setup: func(t *testing.T, dir string) func() { + t.Helper() + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + other, err := NewDomainSocketServer(log, sockPath(dir), testFileMode) + if err != nil { + t.Fatalf("can't create first server: %s", err.Error()) + } + + err = other.Start(test.Context(t)) + if err != nil { + t.Fatalf("can't start up first server: %s", err.Error()) + } + + // NB: The started server is shut down when the test context is canceled. 
+ return func() {} + }, + expErr: FaultSocketFileInUse(""), + }, + "listen fails": { + setup: func(t *testing.T, dir string) func() { + t.Helper() + + if err := os.Chmod(dir, 0500); err != nil { + t.Fatalf("Couldn't change permissions on dir: %v", err) + } + return func() { + _ = os.Chmod(dir, 0700) + } + }, + expErr: errors.New("listen"), + }, + "success": {}, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + tmpDir, tmpCleanup := test.CreateTestDir(t) + defer tmpCleanup() + + if tc.setup != nil { + teardown := tc.setup(t, tmpDir) + defer teardown() + } + + // Test hack - make sure the right path is included in the fault message for comparison + if fault.IsFaultCode(tc.expErr, code.SocketFileInUse) { + tc.expErr = FaultSocketFileInUse(sockPath(tmpDir)) + } + + var err error + var dss *DomainSocketServer + if !tc.nilServer { + dss, err = NewDomainSocketServer(log, sockPath(tmpDir), testFileMode) + if err != nil { + t.Fatal(err) + } + } + + err = dss.Start(test.Context(t)) + + test.CmpErr(t, tc.expErr, err) + }) } - defer func() { - _ = os.Chmod(tmpDir, 0700) - }() - - dss, _ := NewDomainSocketServer(log, path, testFileMode) - - err := dss.Start(test.Context(t)) - - test.CmpErr(t, errors.New("listen"), err) } func TestServer_RegisterModule(t *testing.T) { diff --git a/src/control/drpc/fault.go b/src/control/drpc/fault.go new file mode 100644 index 00000000000..f41a7882a98 --- /dev/null +++ b/src/control/drpc/fault.go @@ -0,0 +1,27 @@ +// +// (C) Copyright 2023 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package drpc + +import ( + "fmt" + + "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/fault/code" +) + +// FaultSocketFileInUse indicates that the dRPC socket file was already in use when we tried +// to start the dRPC server. +func FaultSocketFileInUse(path string) *fault.Fault { + return &fault.Fault{ + Domain: "drpc", + Code: code.SocketFileInUse, + Description: fmt.Sprintf("Configured dRPC socket file '%s' is already in use.", path), + Reason: "dRPC socket file already in use", + Resolution: "If another process is using the socket file, configure a different socket directory. 
" + + "Otherwise, delete the existing socket file and try again.", + } +} diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index 89bfb32bed0..4a368aeb5e3 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -50,6 +50,7 @@ const ( PrivilegedHelperNotPrivileged PrivilegedHelperNotAvailable PrivilegedHelperRequestFailed + SocketFileInUse ) // generic storage fault codes diff --git a/src/control/lib/control/mocks.go b/src/control/lib/control/mocks.go index 8427cbd6846..c70bd89e579 100644 --- a/src/control/lib/control/mocks.go +++ b/src/control/lib/control/mocks.go @@ -377,7 +377,7 @@ func MockServerScanResp(t *testing.T, variant string) *ctlpb.StorageScanResp { for _, i := range []int{1, 2, 3, 4, 5, 6, 7, 8} { nc := storage.MockNvmeController(int32(i)) nc.SocketID = int32(i % 2) - sd := storage.MockSmdDevice(nc.PciAddr, int32(i)) + sd := storage.MockSmdDevice(nc, int32(i)) sd.TotalBytes = uint64(humanize.TByte) * uint64(i) sd.AvailBytes = uint64((humanize.TByte/4)*3) * uint64(i) // 25% used sd.UsableBytes = uint64((humanize.TByte/4)*3) * uint64(i) // 25% used @@ -673,17 +673,18 @@ func MockStorageScanResp(t *testing.T, nvmeControllers := make(storage.NvmeControllers, 0, len(mockNvmeConfigArray)) for index, mockNvmeConfig := range mockNvmeConfigArray { nvmeController := storage.MockNvmeController(int32(index)) - smdDevice := nvmeController.SmdDevices[0] + smdDevice := storage.MockSmdDevice(nvmeController, int32(index)) smdDevice.AvailBytes = mockNvmeConfig.AvailBytes smdDevice.UsableBytes = mockNvmeConfig.UsableBytes smdDevice.TotalBytes = mockNvmeConfig.TotalBytes if mockNvmeConfig.NvmeState != nil { - smdDevice.Ctrlr.NvmeState = *mockNvmeConfig.NvmeState + nvmeController.NvmeState = *mockNvmeConfig.NvmeState } if mockNvmeConfig.NvmeRole != nil { smdDevice.Roles = *mockNvmeConfig.NvmeRole } smdDevice.Rank = mockNvmeConfig.Rank + nvmeController.SmdDevices = []*storage.SmdDevice{smdDevice} nvmeControllers = append(nvmeControllers, nvmeController) } if err := convert.Types(nvmeControllers, &serverScanResponse.Nvme.Ctrlrs); err != nil { diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 3036eaa4456..79b1efaf1e2 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -1361,35 +1361,35 @@ func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespace // Add NVMe free bytes to rankNVMeFreeSpace map. 
func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControllers storage.NvmeControllers, rankNVMeFreeSpace rankFreeSpaceMap) error {
-	for _, nvmeController := range nvmeControllers {
-		for _, smdDevice := range nvmeController.SmdDevices {
+	for _, controller := range nvmeControllers {
+		for _, smdDevice := range controller.SmdDevices {
 			if !smdDevice.Roles.IsEmpty() && (smdDevice.Roles.OptionBits&storage.BdevRoleData) == 0 {
 				log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not used for storing data",
-					smdDevice.UUID, smdDevice.Rank, smdDevice.Ctrlr.PciAddr, smdDevice.Rank)
+					smdDevice.UUID, smdDevice.Rank, controller.PciAddr)
 				continue
 			}
-			if smdDevice.Ctrlr.NvmeState != storage.NvmeStateNormal {
+			if controller.NvmeState != storage.NvmeStateNormal {
 				return errors.Errorf("SMD device %s (rank %d, ctrlr %s) not usable (device state %q)",
-					smdDevice.UUID, smdDevice.Rank, smdDevice.Ctrlr.PciAddr, smdDevice.Ctrlr.NvmeState.String())
+					smdDevice.UUID, smdDevice.Rank, controller.PciAddr, controller.NvmeState.String())
 				continue
 			}
 			if !filterRank(smdDevice.Rank) {
 				log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not in ranklist",
-					smdDevice.UUID, smdDevice.Rank, smdDevice.Ctrlr.PciAddr, smdDevice.Rank)
+					smdDevice.UUID, smdDevice.Rank, controller.PciAddr)
 				continue
 			}
 			if _, exists := rankNVMeFreeSpace[smdDevice.Rank]; !exists {
 				return errors.Errorf("Rank %d without SCM device and at least one SMD device %s (rank %d, ctrlr %s)",
-					smdDevice.Rank, smdDevice.UUID, smdDevice.Rank, smdDevice.Ctrlr.PciAddr)
+					smdDevice.Rank, smdDevice.UUID, smdDevice.Rank, controller.PciAddr)
 			}
 			rankNVMeFreeSpace[smdDevice.Rank] += smdDevice.UsableBytes
 			log.Debugf("Added SMD device %s (rank %d, ctrlr %s) is usable: device state=%q, smd-size=%d ctrlr-total-free=%d",
-				smdDevice.UUID, smdDevice.Rank, smdDevice.Ctrlr.PciAddr, smdDevice.Ctrlr.NvmeState.String(),
+				smdDevice.UUID, smdDevice.Rank, controller.PciAddr, controller.NvmeState.String(),
 				smdDevice.UsableBytes, rankNVMeFreeSpace[smdDevice.Rank])
 		}
 	}
diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go
index 36d757255ee..12f8389eadc 100644
--- a/src/control/lib/control/storage.go
+++ b/src/control/lib/control/storage.go
@@ -237,10 +237,10 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe
 				Usage: req.Usage,
 			},
 			Nvme: &ctlpb.ScanNvmeReq{
-				Health: req.NvmeHealth,
-				// NVMe meta option will populate usage statistics
-				Meta:  req.NvmeMeta || req.Usage,
 				Basic: req.NvmeBasic,
+				// Health and meta details required to populate usage statistics.
+				Health: req.NvmeHealth || req.Usage,
+				Meta:   req.NvmeMeta || req.Usage,
 			},
 		})
 	})
diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go
index 6be17cf4ae6..48ead6d68fb 100644
--- a/src/control/server/config/server_test.go
+++ b/src/control/server/config/server_test.go
@@ -289,9 +289,7 @@ func TestServerConfig_Constructed(t *testing.T) {
 				WithEnvVars("CRT_TIMEOUT=30").
 				WithLogFile("/tmp/daos_engine.0.log").
 				WithLogMask("INFO").
- WithStorageEnableHotplug(true). - WithStorageAccelProps(storage.AccelEngineDML, storage.AccelOptCRCFlag), + WithStorageEnableHotplug(true), } constructed.Path = testFile // just to avoid failing the cmp diff --git a/src/control/server/ctl_ranks_rpc.go b/src/control/server/ctl_ranks_rpc.go index a65929a20db..c3257f1e941 100644 --- a/src/control/server/ctl_ranks_rpc.go +++ b/src/control/server/ctl_ranks_rpc.go @@ -302,7 +302,7 @@ func (svc *ControlService) StartRanks(ctx context.Context, req *ctlpb.RanksReq) // ignore poll results as we gather state immediately after pollFn := func(e Engine) bool { return e.IsReady() } if err := pollInstanceState(ctx, instances, pollFn); err != nil { - return nil, errors.Wrap(err, "waiting for engines to start") + return nil, errors.Wrap(err, "waiting for engines to be ready to receive drpcs") } // instances will update state to "Started" through join or diff --git a/src/control/server/ctl_storage.go b/src/control/server/ctl_storage.go index f4747f87513..443f2a0bc76 100644 --- a/src/control/server/ctl_storage.go +++ b/src/control/server/ctl_storage.go @@ -7,8 +7,6 @@ package server import ( - "context" - "fmt" "path/filepath" "strings" @@ -16,7 +14,6 @@ import ( "github.com/pkg/errors" "github.com/daos-stack/daos/src/control/common" - "github.com/daos-stack/daos/src/control/common/proto/ctl" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/engine" "github.com/daos-stack/daos/src/control/server/storage" @@ -52,7 +49,7 @@ func (scs *StorageControlService) NvmeScan(req storage.BdevScanRequest) (*storag // WithVMDEnabled enables VMD support in storage provider. func (scs *StorageControlService) WithVMDEnabled() *StorageControlService { - scs.storage.WithVMDEnabled() + scs.storage.WithVMDEnabled(true) return scs } @@ -156,93 +153,3 @@ func (cs *ControlService) getScmUsage(ssr *storage.ScmScanResponse) (*storage.Sc return &storage.ScmScanResponse{Namespaces: nss}, nil } - -// scanAssignedBdevs retrieves up-to-date NVMe controller info including -// health statistics and stored server meta-data. If I/O Engines are running -// then query is issued over dRPC as go-spdk bindings cannot be used to access -// controller claimed by another process. Only update info for controllers -// assigned to I/O Engines. -func (cs *ControlService) scanAssignedBdevs(ctx context.Context, nsps []*ctl.ScmNamespace, statsReq bool) (*storage.BdevScanResponse, error) { - instances := cs.harness.Instances() - ctrlrs := new(storage.NvmeControllers) - - for _, ei := range instances { - if !ei.GetStorage().HasBlockDevices() { - continue - } - - tsrs, err := ei.ScanBdevTiers() - if err != nil { - return nil, err - } - - // Build slice of controllers in all tiers. - tierCtrlrs := make([]storage.NvmeController, 0) - msg := fmt.Sprintf("NVMe tiers for engine-%d:", ei.Index()) - for _, tsr := range tsrs { - msg += fmt.Sprintf("\n\tTier-%d: %s", tsr.Tier, tsr.Result.Controllers) - for _, c := range tsr.Result.Controllers { - tierCtrlrs = append(tierCtrlrs, *c) - } - } - cs.log.Info(msg) - - // If the engine is not running or we aren't interested in temporal - // statistics for the bdev devices then continue to next engine. - if !ei.IsReady() || !statsReq { - ctrlrs.Update(tierCtrlrs...) 
- continue - } - - cs.log.Debugf("updating stats for %d bdev(s) on instance %d", len(tierCtrlrs), - ei.Index()) - - // DAOS-12750 Compute the maximal size of the metadata to allow the engine to fill - // the WallMeta field response. The maximal metadata (i.e. VOS index file) size - // should be equal to the SCM available size divided by the number of targets of the - // engine. - var md_size uint64 - var rdb_size uint64 - for _, nsp := range nsps { - mp := nsp.GetMount() - if mp == nil { - continue - } - if r, err := ei.GetRank(); err != nil || uint32(r) != mp.GetRank() { - continue - } - - // NOTE DAOS-14223: This metadata size calculation won't necessarily match - // the meta blob size on SSD if --meta-size is specified in - // pool create command. - md_size = mp.GetUsableBytes() / uint64(ei.GetTargetCount()) - - engineCfg, err := cs.getEngineCfgFromScmNsp(nsp) - if err != nil { - return nil, errors.Wrap(err, "Engine with invalid configuration") - } - rdb_size, err = cs.getRdbSize(engineCfg) - if err != nil { - return nil, err - } - break - } - - if md_size == 0 { - cs.log.Noticef("instance %d: no SCM space available for metadata", ei.Index) - } - - // If engine is running and has claimed the assigned devices for - // each tier, iterate over scan results for each tier and send query - // over drpc to update controller details with current health stats - // and smd info. - updatedCtrlrs, err := ei.updateInUseBdevs(ctx, tierCtrlrs, md_size, rdb_size) - if err != nil { - return nil, errors.Wrapf(err, "instance %d: update online bdevs", ei.Index()) - } - - ctrlrs.Update(updatedCtrlrs...) - } - - return &storage.BdevScanResponse{Controllers: *ctrlrs}, nil -} diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 7ad81f35f64..a7a87de805a 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -61,32 +61,82 @@ func newResponseState(inErr error, badStatus ctlpb.ResponseStatus, infoMsg strin return rs } -// stripNvmeDetails removes all controller details leaving only PCI address and -// NUMA node/socket ID. Useful when scanning only device topology. -func stripNvmeDetails(pbc *ctlpb.NvmeController) { - pbc.Serial = "" - pbc.Model = "" - pbc.FwRev = "" -} +// Package-local function variables for mocking in unit tests. +var ( + scanBdevs = bdevScan // StorageScan() unit tests + scanEngineBdevs = bdevScanEngine // bdevScan() unit tests + computeMetaRdbSz = metaRdbComputeSz // TODO unit tests +) -// newScanBdevResp populates protobuf NVMe scan response with controller info -// including health statistics or metadata if requested. -func newScanNvmeResp(req *ctlpb.ScanNvmeReq, inResp *storage.BdevScanResponse, inErr error) (*ctlpb.ScanNvmeResp, error) { - outResp := new(ctlpb.ScanNvmeResp) - outResp.State = new(ctlpb.ResponseState) +type scanBdevsFn func(storage.BdevScanRequest) (*storage.BdevScanResponse, error) - if inErr != nil { - outResp.State = newResponseState(inErr, ctlpb.ResponseStatus_CTL_ERR_NVME, "") - return outResp, nil +// Convert bdev scan results to protobuf response. 
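
The package-local function variables above exist purely as test seams: production code always calls through the variable, and unit tests swap in a stub and restore the original with defer. The same pattern in miniature (hypothetical names, not the DAOS code):

    // seam.go
    package scan

    var scanEngine = realScanEngine // tests may reassign this variable

    func realScanEngine(id int) (int, error) { return 0, nil }

    func scanAll(ids []int) (int, error) {
    	total := 0
    	for _, id := range ids {
    		n, err := scanEngine(id) // always call through the seam
    		if err != nil {
    			return 0, err
    		}
    		total += n
    	}
    	return total, nil
    }

    // seam_test.go (same package)
    import "testing"

    func TestScanAll(t *testing.T) {
    	orig := scanEngine
    	scanEngine = func(id int) (int, error) { return 2, nil } // stub
    	defer func() { scanEngine = orig }() // restore for other tests

    	if n, _ := scanAll([]int{1, 2, 3}); n != 6 {
    		t.Fatalf("want 6, got %d", n)
    	}
    }
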
+func bdevScanToProtoResp(scan scanBdevsFn, req storage.BdevScanRequest) (*ctlpb.ScanNvmeResp, error) { + resp, err := scan(req) + if err != nil { + return nil, err } - pbCtrlrs := make(proto.NvmeControllers, 0, len(inResp.Controllers)) - if err := pbCtrlrs.FromNative(inResp.Controllers); err != nil { + pbCtrlrs := make(proto.NvmeControllers, 0, len(resp.Controllers)) + + if err := pbCtrlrs.FromNative(resp.Controllers); err != nil { return nil, err } - // trim unwanted fields so responses can be coalesced from hash map - for _, pbc := range pbCtrlrs { + return &ctlpb.ScanNvmeResp{ + State: new(ctlpb.ResponseState), + Ctrlrs: pbCtrlrs, + }, nil +} + +// Scan bdevs through harness's ControlService (not per-engine). +func bdevScanGlobal(cs *ControlService, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) { + req := storage.BdevScanRequest{DeviceList: cfgBdevs} + return bdevScanToProtoResp(cs.storage.ScanBdevs, req) +} + +// Scan bdevs through each engine and collate response results. +func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (*ctlpb.ScanNvmeResp, error) { + var errLast error + instances := cs.harness.Instances() + resp := &ctlpb.ScanNvmeResp{} + + for _, ei := range instances { + eReq := new(ctlpb.ScanNvmeReq) + *eReq = *req + if req.Meta { + ms, rs, err := computeMetaRdbSz(cs, ei, nsps) + if err != nil { + return nil, errors.Wrap(err, "computing meta and rdb size") + } + eReq.MetaSize, eReq.RdbSize = ms, rs + } + + respEng, err := scanEngineBdevs(ctx, ei, eReq) + if err != nil { + err = errors.Wrapf(err, "instance %d", ei.Index()) + if errLast == nil && len(instances) > 1 { + errLast = err // Save err to preserve partial results. + cs.log.Error(err.Error()) + continue + } + return nil, err // No partial results to save so fail. + } + resp.Ctrlrs = append(resp.Ctrlrs, respEng.Ctrlrs...) + } + + // If one engine succeeds and one other fails, error is embedded in the response. + resp.State = newResponseState(errLast, ctlpb.ResponseStatus_CTL_ERR_NVME, "") + + return resp, nil +} + +// Trim unwanted fields so responses can be coalesced from hash map when returned from server. +func bdevScanTrimResults(req *ctlpb.ScanNvmeReq, resp *ctlpb.ScanNvmeResp) *ctlpb.ScanNvmeResp { + if resp == nil { + return nil + } + for _, pbc := range resp.Ctrlrs { if !req.GetHealth() { pbc.HealthStats = nil } @@ -94,40 +144,85 @@ func newScanNvmeResp(req *ctlpb.ScanNvmeReq, inResp *storage.BdevScanResponse, i pbc.SmdDevices = nil } if req.GetBasic() { - stripNvmeDetails(pbc) + pbc.Serial = "" + pbc.Model = "" + pbc.FwRev = "" } } - outResp.Ctrlrs = pbCtrlrs + return resp +} - return outResp, nil +func engineHasStarted(instances []Engine) bool { + for _, ei := range instances { + if ei.IsStarted() { + return true + } + } + + return false +} + +func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace, hasStarted *bool, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) { + *hasStarted = engineHasStarted(cs.harness.Instances()) + if !*hasStarted { + cs.log.Debugf("scan bdevs from control service as no engines started") + return bdevScanGlobal(cs, cfgBdevs) + } + + // Delegate scan to engine instances as soon as one engine with assigned bdevs has started. 
+ cs.log.Debugf("scan assigned bdevs through engine instances as some are started") + return bdevScanEngines(ctx, cs, req, nsps) } -// scanBdevs updates transient details if health statistics or server metadata -// is requested otherwise just retrieves cached static controller details. -func (c *ControlService) scanBdevs(ctx context.Context, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (*ctlpb.ScanNvmeResp, error) { +// Return NVMe device details. The scan method employed depends on whether the engines are running +// or not. If running, scan over dRPC. If not running then use engine's storage provider. +func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (resp *ctlpb.ScanNvmeResp, err error) { if req == nil { - return nil, errors.New("nil bdev request") + return nil, errors.New("nil request") } - var bdevsInCfg bool - for _, ei := range c.harness.Instances() { - if ei.GetStorage().HasBlockDevices() { - bdevsInCfg = true + cfgBdevs := getBdevCfgsFromSrvCfg(cs.srvCfg).Bdevs() + + if cfgBdevs.Len() == 0 { + cs.log.Debugf("scan bdevs from control service as no bdevs in cfg") + + // No bdevs configured for engines to claim so scan through control service. + resp, err = bdevScanGlobal(cs, cfgBdevs) + if err != nil { + return nil, err } + return bdevScanTrimResults(req, resp), nil + } + + // Note the potential window where engines are started but not yet ready to respond. In this + // state there is a possibility that neither scan mechanism will work because devices have + // been claimed by SPDK but details are not yet available over dRPC. + + var hasStarted bool + resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs) + if err != nil { + return nil, err } - if !bdevsInCfg { - c.log.Debugf("no bdevs in cfg so scan all") - // return details of all bdevs if none are assigned to engines - resp, err := c.storage.ScanBdevs(storage.BdevScanRequest{}) - return newScanNvmeResp(req, resp, err) + // Retry once if global scan returns unexpected number of controllers in case engines + // claimed devices between when started state was checked and scan was executed. + if !hasStarted && len(resp.Ctrlrs) != cfgBdevs.Len() { + cs.log.Debugf("retrying bdev scan as unexpected nr returned, want %d got %d", + cfgBdevs.Len(), len(resp.Ctrlrs)) + + resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs) + if err != nil { + return nil, err + } } - c.log.Debugf("bdevs in cfg so scan only assigned") - resp, err := c.scanAssignedBdevs(ctx, nsps, req.GetHealth() || req.GetMeta()) + if len(resp.Ctrlrs) != cfgBdevs.Len() { + cs.log.Noticef("bdev scan returned unexpected nr, want %d got %d", + cfgBdevs.Len(), len(resp.Ctrlrs)) + } - return newScanNvmeResp(req, resp, err) + return bdevScanTrimResults(req, resp), nil } // newScanScmResp sets protobuf SCM scan response with module or namespace info. 
@@ -174,8 +269,6 @@ func (c *ControlService) scanScm(ctx context.Context, req *ctlpb.ScanScmReq) (*c // Returns the engine configuration managing the given NVMe controller func (c *ControlService) getEngineCfgFromNvmeCtl(nc *ctl.NvmeController) (*engine.Config, error) { - var engineCfg *engine.Config - pciAddr, err := hardware.NewPCIAddress(nc.GetPciAddr()) if err != nil { return nil, errors.Errorf("Invalid PCI address: %s", err) @@ -188,58 +281,33 @@ func (c *ControlService) getEngineCfgFromNvmeCtl(nc *ctl.NvmeController) (*engin ctlrAddr := pciAddr.String() for index := range c.srvCfg.Engines { - if engineCfg != nil { - break - } - for _, tierCfg := range c.srvCfg.Engines[index].Storage.Tiers { - if engineCfg != nil { - break - } - if !tierCfg.IsBdev() { continue } - for _, devName := range tierCfg.Bdev.DeviceList.Devices() { if devName == ctlrAddr { - engineCfg = c.srvCfg.Engines[index] - break + return c.srvCfg.Engines[index], nil } - } } } - if engineCfg == nil { - return nil, errors.Errorf("unknown PCI device %q", pciAddr) - } - - return engineCfg, nil + return nil, errors.Errorf("unknown PCI device %q", pciAddr) } // Returns the engine configuration managing the given SCM name-space func (c *ControlService) getEngineCfgFromScmNsp(nsp *ctl.ScmNamespace) (*engine.Config, error) { - var engineCfg *engine.Config mountPoint := nsp.GetMount().Path for index := range c.srvCfg.Engines { - if engineCfg != nil { - break - } - for _, tierCfg := range c.srvCfg.Engines[index].Storage.Tiers { if tierCfg.IsSCM() && tierCfg.Scm.MountPoint == mountPoint { - engineCfg = c.srvCfg.Engines[index] - break + return c.srvCfg.Engines[index], nil } } } - if engineCfg == nil { - return nil, errors.Errorf("unknown SCM mount point %s", mountPoint) - } - - return engineCfg, nil + return nil, errors.Errorf("unknown SCM mount point %s", mountPoint) } // return the size of the RDB file used for managing SCM metadata @@ -247,7 +315,8 @@ func (c *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { mdCapStr, err := engineCfg.GetEnvVar(daos.DaosMdCapEnv) if err != nil { c.log.Debugf("using default RDB file size with engine %d: %s (%d Bytes)", - engineCfg.Index, humanize.Bytes(daos.DefaultDaosMdCapSize), daos.DefaultDaosMdCapSize) + engineCfg.Index, humanize.Bytes(daos.DefaultDaosMdCapSize), + daos.DefaultDaosMdCapSize) return uint64(daos.DefaultDaosMdCapSize), nil } @@ -263,6 +332,43 @@ func (c *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { return rdbSize, nil } +// Compute the maximal size of the metadata to allow the engine to fill the WallMeta field +// response. The maximal metadata (i.e. VOS index file) size should be equal to the SCM available +// size divided by the number of targets of the engine. +func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace) (md_size, rdb_size uint64, errOut error) { + for _, nsp := range nsps { + mp := nsp.GetMount() + if mp == nil { + continue + } + if r, err := ei.GetRank(); err != nil || uint32(r) != mp.GetRank() { + continue + } + + // NOTE DAOS-14223: This metadata size calculation won't necessarily match + // the meta blob size on SSD if --meta-size is specified in + // pool create command. 
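+	// For example (illustrative numbers only): 1 TiB of usable SCM on a
+	// 16-target engine caps each VOS index file at 1 TiB / 16 = 64 GiB.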
+		md_size = mp.GetUsableBytes() / uint64(ei.GetTargetCount())
+
+		engineCfg, err := cs.getEngineCfgFromScmNsp(nsp)
+		if err != nil {
+			errOut = errors.Wrap(err, "Engine with invalid configuration")
+			return
+		}
+		rdb_size, errOut = cs.getRdbSize(engineCfg)
+		if errOut != nil {
+			return
+		}
+		break
+	}
+
+	if md_size == 0 {
+		cs.log.Noticef("instance %d: no SCM space available for metadata", ei.Index())
+	}
+
+	return
+}
+
 type deviceToAdjust struct {
 	ctlr *ctl.NvmeController
 	idx  int
@@ -367,9 +473,9 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) {
 				continue
 			}
-			if dev.Ctrlr.GetDevState() != ctlpb.NvmeDevState_NORMAL {
+			if ctlr.GetDevState() != ctlpb.NvmeDevState_NORMAL {
 				c.log.Debugf("SMD device %s (rank %d, ctlr %s) not usable: device state %q",
-					dev.GetUuid(), rank, ctlr.GetPciAddr(), ctlpb.NvmeDevState_name[int32(dev.Ctrlr.DevState)])
+					dev.GetUuid(), rank, ctlr.GetPciAddr(), ctlpb.NvmeDevState_name[int32(ctlr.DevState)])
 				dev.AvailBytes = 0
 				dev.UsableBytes = 0
 				continue
@@ -522,6 +628,10 @@ func (c *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScan
 	// be applied if only the Meta flag is set in the NVMe component of the request to continue
 	// to support off-line storage scan functionality which uses cached stats (e.g. dmg storage
 	// scan --nvme-meta).
+	//
+	// TODO DAOS-13228: Remove --nvme-meta scan option and the below workaround.
+	// If usage or meta requested, fail if no engines started and skip stopped
+	// engines in bdev scan. Only return results for ready engines over dRPC.
 	if req.Scm.Usage && req.Nvme.Meta {
 		nrInstances := len(c.harness.Instances())
 		readyRanks := c.harness.readyRanks()
@@ -541,7 +651,7 @@ func (c *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScan
 	}
 	resp.Scm = respScm

-	respNvme, err := c.scanBdevs(ctx, req.Nvme, respScm.Namespaces)
+	respNvme, err := scanBdevs(ctx, c, req.Nvme, respScm.Namespaces)
 	if err != nil {
 		return nil, err
 	}
@@ -713,29 +823,49 @@ type formatNvmeReq struct {
 func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageFormatResp) {
 	// Allow format to complete on one instance even if another fails
-	// TODO: perform bdev format in parallel
-	for idx, ei := range req.instances {
+	for idx, engine := range req.instances {
 		_, hasError := req.errored[idx]
 		_, skipped := req.skipped[idx]
 		if hasError || (skipped && !req.mdFormatted) {
-			// if scm failed to format or was already formatted, indicate skipping bdev format
-			ret := ei.newCret(storage.NilBdevAddress, nil)
-			ret.State.Info = fmt.Sprintf(msgNvmeFormatSkip, ei.Index())
+			// If scm failed to format or was already formatted, skip bdev format.
+			ret := engine.newCret(storage.NilBdevAddress, nil)
+			ret.State.Info = fmt.Sprintf(msgNvmeFormatSkip, engine.Index())
 			resp.Crets = append(resp.Crets, ret)
 			continue
 		}
+
+		respBdevs, err := scanEngineBdevs(ctx, engine, new(ctlpb.ScanNvmeReq))
+		if err != nil {
+			if errors.Is(err, errEngineBdevScanEmptyDevList) {
+				// No controllers assigned in config, continue.
+				continue
+			}
+			req.errored[idx] = err.Error()
+			resp.Crets = append(resp.Crets, engine.newCret("", err))
+			continue
+		}
+
+		// Convert proto ctrlr scan results to native when calling into storage provider.
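+		// The provider format and write-config paths below take native
+		// storage types rather than protobuf messages, hence ToNative().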
+ pbCtrlrs := proto.NvmeControllers(respBdevs.Ctrlrs) + ctrlrs, err := pbCtrlrs.ToNative() + if err != nil { + req.errored[idx] = err.Error() + resp.Crets = append(resp.Crets, engine.newCret("", err)) + continue + } + // SCM formatted correctly on this instance, format NVMe - cResults := ei.StorageFormatNVMe() + cResults := formatEngineBdevs(engine.(*EngineInstance), ctrlrs) + if cResults.HasErrors() { req.errored[idx] = cResults.Errors() resp.Crets = append(resp.Crets, cResults...) continue } - if err := ei.GetStorage().WriteNvmeConfig(ctx, req.log); err != nil { + if err := engine.GetStorage().WriteNvmeConfig(ctx, req.log, ctrlrs); err != nil { req.errored[idx] = err.Error() - cResults = append(cResults, ei.newCret("", err)) + cResults = append(cResults, engine.newCret("", err)) } resp.Crets = append(resp.Crets, cResults...) @@ -879,8 +1009,9 @@ func (c *ControlService) StorageNvmeAddDevice(ctx context.Context, req *ctlpb.Nv } c.log.Debugf("updated bdev list: %+v", tierCfg.Bdev.DeviceList) + // TODO: Supply scan results for VMD backing device address mapping. resp = new(ctlpb.NvmeAddDeviceResp) - if err := engineStorage.WriteNvmeConfig(ctx, c.log); err != nil { + if err := engineStorage.WriteNvmeConfig(ctx, c.log, nil); err != nil { err = errors.Wrapf(err, "write nvme config for engine %d", engineIndex) c.log.Error(err.Error()) diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index 0674198e22f..3fb19e80300 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -43,9 +43,468 @@ const defaultRdbSize uint64 = uint64(daos.DefaultDaosMdCapSize) var ( defStorageScanCmpOpts = append(test.DefaultCmpOpts(), protocmp.IgnoreFields(&ctlpb.NvmeController{}, "serial")) + defProviderScanRes = &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + storage.MockNvmeController(1), + }, + } + defEngineScanRes = &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + } ) -func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { +func TestServer_bdevScan(t *testing.T) { + for name, tc := range map[string]struct { + req *ctlpb.ScanNvmeReq + provRes *storage.BdevScanResponse + provErr error + engTierCfgs []storage.TierConfigs // one per-engine + engStopped []bool // one per-engine (all false if unset) + engRes []ctlpb.ScanNvmeResp // one per-engine + engErr []error // one per-engine + expResp *ctlpb.ScanNvmeResp + expErr error + expBackendScanCalls []storage.BdevScanRequest + }{ + "nil request": { + expErr: errors.New("nil request"), + }, + "no bdevs in config; scan local fails": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{{}}, + provErr: errors.New("fail"), + engStopped: []bool{false}, + expErr: errors.New("fail"), + }, + "no bdevs in config; scan local; devlist passed to backend": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{{}}, + engStopped: []bool{false}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + {DeviceList: new(storage.BdevDeviceList)}, + }, + }, + "bdevs in config; engine not started; scan local; devlist passed to backend": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). 
+ WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + storage.MockNvmeController(1), + storage.MockNvmeController(2), + }, + }, + engStopped: []bool{true}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "bdevs in config; engine not started; scan local; retry on empty response": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{}, + }, + engStopped: []bool{true}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{}, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "bdevs in config; engine started; scan remote": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan remote; collate results from multiple engines": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(3), + test.MockPCIAddr(4)), + }, + }, + engRes: []ctlpb.ScanNvmeResp{ + { + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + { + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(3), + proto.MockNvmeController(4), + }, + State: new(ctlpb.ResponseState), + }, + }, + engErr: []error{nil, nil}, + engStopped: []bool{false, false}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + proto.MockNvmeController(2), + proto.MockNvmeController(3), + proto.MockNvmeController(4), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan remote; both engine scans fail": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). 
+ WithBdevDeviceList(test.MockPCIAddr(3), + test.MockPCIAddr(4)), + }, + }, + engRes: []ctlpb.ScanNvmeResp{{}, {}}, + engErr: []error{errors.New("fail1"), errors.New("fail2")}, + engStopped: []bool{false, false}, + expErr: errors.New("fail2"), + }, + "scan remote; partial results with one failed engine scan": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(3), + test.MockPCIAddr(4)), + }, + }, + engRes: []ctlpb.ScanNvmeResp{ + {}, + { + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(3), + proto.MockNvmeController(4), + }, + State: new(ctlpb.ResponseState), + }, + }, + engErr: []error{errors.New("fail"), nil}, + engStopped: []bool{false, false}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(3), + proto.MockNvmeController(4), + }, + State: &ctlpb.ResponseState{ + Error: "instance 0: fail", + Status: ctlpb.ResponseStatus_CTL_ERR_NVME, + }, + }, + }, + "scan remote; filter results based on request basic flag": { + req: &ctlpb.ScanNvmeReq{Basic: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engRes: []ctlpb.ScanNvmeResp{ + { + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + }, + engErr: []error{nil}, + engStopped: []bool{false}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + nc.SmdDevices = nil + nc.FwRev = "" + nc.Model = "" + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + nc.HealthStats = nil + nc.SmdDevices = nil + nc.FwRev = "" + nc.Model = "" + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan local; filter results based on request basic flag": { + req: &ctlpb.ScanNvmeReq{Basic: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + storage.MockNvmeController(1), + storage.MockNvmeController(2), + }, + }, + engStopped: []bool{true}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + nc.SmdDevices = nil + nc.FwRev = "" + nc.Model = "" + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + nc.HealthStats = nil + nc.SmdDevices = nil + nc.FwRev = "" + nc.Model = "" + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "bdevs in config; engine not started; scan local; vmd enabled": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). 
+ WithBdevDeviceList("0000:05:05.5"), + }, + }, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + &storage.NvmeController{PciAddr: "050505:01:00.0"}, + &storage.NvmeController{PciAddr: "050505:03:00.0"}, + }, + }, + engStopped: []bool{true}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + &ctlpb.NvmeController{PciAddr: "050505:01:00.0"}, + &ctlpb.NvmeController{PciAddr: "050505:03:00.0"}, + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + {DeviceList: storage.MustNewBdevDeviceList("0000:05:05.5")}, + {DeviceList: storage.MustNewBdevDeviceList("0000:05:05.5")}, + }, + }, + "bdevs in config; engine started; scan remote; vmd enabled": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList("0000:05:05.5"), + }, + }, + engRes: []ctlpb.ScanNvmeResp{ + { + Ctrlrs: proto.NvmeControllers{ + &ctlpb.NvmeController{PciAddr: "050505:01:00.0"}, + }, + State: new(ctlpb.ResponseState), + }, + }, + engErr: []error{nil}, + engStopped: []bool{false}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + &ctlpb.NvmeController{PciAddr: "050505:01:00.0"}, + }, + State: new(ctlpb.ResponseState), + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + if tc.provRes == nil { + tc.provRes = defProviderScanRes + } + if tc.engRes == nil { + tc.engRes = []ctlpb.ScanNvmeResp{*defEngineScanRes} + } + + if len(tc.engStopped) != len(tc.engTierCfgs) { + t.Fatal("len tc.engStopped != len tc.tierCfgs") + } + + idx := 0 + // Mock per-engine-scan function to focus on unit testing bdevScan(). + scanEngineBdevs = func(_ context.Context, _ Engine, _ *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + if len(tc.engRes) <= idx { + t.Fatal("engine scan called but response not specified") + } + if len(tc.engErr) <= idx { + t.Fatal("engine scan called but error not specified") + } + engRes := tc.engRes[idx] + engErr := tc.engErr[idx] + idx++ + return &engRes, engErr + } + defer func() { + scanEngineBdevs = bdevScanEngine + }() + + engCfgs := []*engine.Config{} + for _, tcs := range tc.engTierCfgs { + engCfg := engine.MockConfig().WithStorage(tcs...) + engCfgs = append(engCfgs, engCfg) + } + sCfg := config.DefaultServer().WithEngines(engCfgs...) + + bmbc := &bdev.MockBackendConfig{ + ScanRes: tc.provRes, + ScanErr: tc.provErr, + } + bmb := bdev.NewMockBackend(bmbc) + smb := scm.NewMockBackend(nil) + + cs := newMockControlServiceFromBackends(t, log, sCfg, bmb, smb, nil, + tc.engStopped...) 
+ + resp, err := bdevScan(test.Context(t), cs, tc.req, nil) + test.CmpErr(t, tc.expErr, err) + if err != nil { + return + } + + if diff := cmp.Diff(tc.expResp, resp, + defStorageScanCmpOpts...); diff != "" { + t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + } + + cmpopt := cmp.Comparer(func(x, y *storage.BdevDeviceList) bool { + if x == nil && y == nil { + return true + } + return x.Equals(y) + }) + + bmb.RLock() + if len(tc.expBackendScanCalls) != len(bmb.ScanCalls) { + t.Fatalf("unexpected number of backend scan calls, want %d got %d", + len(tc.expBackendScanCalls), len(bmb.ScanCalls)) + } + if len(tc.expBackendScanCalls) == 0 { + return + } + if diff := cmp.Diff(tc.expBackendScanCalls, bmb.ScanCalls, + append(defStorageScanCmpOpts, cmpopt)...); diff != "" { + t.Fatalf("unexpected backend scan calls (-want, +got):\n%s\n", diff) + } + bmb.RUnlock() + }) + } +} + +func TestServer_CtlSvc_StorageScan(t *testing.T) { ctrlr := storage.MockNvmeController() ctrlr.SmdDevices = nil ctrlrPB := proto.MockNvmeController() @@ -63,22 +522,22 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { ctrlrPBBasic.Model = "" for name, tc := range map[string]struct { - multiEngine bool - req *ctlpb.StorageScanReq - bmbc *bdev.MockBackendConfig - smbc *scm.MockBackendConfig - tierCfgs storage.TierConfigs - expResp *ctlpb.StorageScanResp - expErr error + req *ctlpb.StorageScanReq + bdevScanRes *ctlpb.ScanNvmeResp + bdevScanErr error + smbc *scm.MockBackendConfig + tierCfgs storage.TierConfigs + enginesNotReady bool + expResp *ctlpb.StorageScanResp + expErr error }{ "successful scan; scm namespaces": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ - ctrlr, - storage.MockNvmeController(2), - }, + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, + ctrlrPB2, }, + State: new(ctlpb.ResponseState), }, smbc: &scm.MockBackendConfig{ GetModulesRes: storage.ScmModules{storage.MockScmModule()}, @@ -105,36 +564,15 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { }, }, "successful scan; no scm namespaces": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - smbc: &scm.MockBackendConfig{ - GetModulesRes: storage.ScmModules{storage.MockScmModule()}, - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ctrlrPB}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - Modules: proto.ScmModules{proto.MockScmModule()}, - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, - "successful scan; no bdevs in config": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, }, + State: new(ctlpb.ResponseState), }, smbc: &scm.MockBackendConfig{ GetModulesRes: storage.ScmModules{storage.MockScmModule()}, }, - tierCfgs: storage.TierConfigs{}, expResp: &ctlpb.StorageScanResp{ Nvme: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ctrlrPB}, @@ -147,40 +585,13 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { MemInfo: proto.MockPBMemInfo(), }, }, - "successful scan; missing bdev in config": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - smbc: &scm.MockBackendConfig{ - GetModulesRes: 
storage.ScmModules{storage.MockScmModule()}, - }, - tierCfgs: storage.TierConfigs{ - storage.NewTierConfig(). - WithStorageClass(storage.ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2)), - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - Modules: proto.ScmModules{proto.MockScmModule()}, - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, "successful scan; multiple bdev tiers in config": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ - ctrlr, - storage.MockNvmeController(2), - }, + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, + ctrlrPB2, }, + State: new(ctlpb.ResponseState), }, smbc: &scm.MockBackendConfig{ GetModulesRes: storage.ScmModules{storage.MockScmModule()}, @@ -209,8 +620,11 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { }, }, "spdk scan failure": { - bmbc: &bdev.MockBackendConfig{ - ScanErr: errors.New("spdk scan failed"), + bdevScanRes: &ctlpb.ScanNvmeResp{ + State: &ctlpb.ResponseState{ + Status: ctlpb.ResponseStatus_CTL_ERR_NVME, + Error: "spdk scan failed", + }, }, smbc: &scm.MockBackendConfig{ GetModulesRes: storage.ScmModules{storage.MockScmModule()}, @@ -231,10 +645,11 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { }, }, "scm module discovery failure": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, }, + State: new(ctlpb.ResponseState), }, smbc: &scm.MockBackendConfig{ GetModulesErr: errors.New("scm discover failed"), @@ -254,8 +669,11 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { }, }, "all discover fail": { - bmbc: &bdev.MockBackendConfig{ - ScanErr: errors.New("spdk scan failed"), + bdevScanRes: &ctlpb.ScanNvmeResp{ + State: &ctlpb.ResponseState{ + Status: ctlpb.ResponseStatus_CTL_ERR_NVME, + Error: "spdk scan failed", + }, }, smbc: &scm.MockBackendConfig{ GetModulesErr: errors.New("scm discover failed"), @@ -276,111 +694,12 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { MemInfo: proto.MockPBMemInfo(), }, }, - "scan bdev health; single engine down": { - req: &ctlpb.StorageScanReq{ - Scm: &ctlpb.ScanScmReq{}, - Nvme: &ctlpb.ScanNvmeReq{ - Health: true, - }, - }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ctrlrPBwHealth}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, - "scan bdev health; multiple engines down": { - multiEngine: true, - req: &ctlpb.StorageScanReq{ - Scm: &ctlpb.ScanScmReq{}, - Nvme: &ctlpb.ScanNvmeReq{ - Health: true, - }, - }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - // response should not contain duplicates - Ctrlrs: proto.NvmeControllers{ctrlrPBwHealth}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, - "scan bdev meta; 
engines down": { - req: &ctlpb.StorageScanReq{ - Scm: &ctlpb.ScanScmReq{}, - Nvme: &ctlpb.ScanNvmeReq{ - Meta: true, - }, - }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ctrlrPB}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, - "scan bdev; nvme basic set": { - req: &ctlpb.StorageScanReq{ - Scm: &ctlpb.ScanScmReq{}, - Nvme: &ctlpb.ScanNvmeReq{ - Basic: true, - }, - }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ctrlr}, - }, - }, - expResp: &ctlpb.StorageScanResp{ - Nvme: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ctrlrPBBasic}, - State: new(ctlpb.ResponseState), - }, - Scm: &ctlpb.ScanScmResp{ - State: new(ctlpb.ResponseState), - }, - MemInfo: proto.MockPBMemInfo(), - }, - }, "scan bdev; vmd enabled": { - req: &ctlpb.StorageScanReq{ - Scm: &ctlpb.ScanScmReq{}, - Nvme: &ctlpb.ScanNvmeReq{}, - }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ - &storage.NvmeController{PciAddr: "050505:01:00.0"}, - }, + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + &ctlpb.NvmeController{PciAddr: "050505:01:00.0"}, }, + State: new(ctlpb.ResponseState), }, tierCfgs: storage.TierConfigs{ storage.NewTierConfig(). @@ -409,30 +728,31 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { Meta: true, }, }, - expErr: errEngineNotReady, + enginesNotReady: true, + expErr: errEngineNotReady, }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - if tc.tierCfgs == nil { - tc.tierCfgs = storage.TierConfigs{ - storage.NewTierConfig(). - WithStorageClass(storage.ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(1)), - } - } - engineCfg := engine.MockConfig().WithStorage(tc.tierCfgs...) engineCfgs := []*engine.Config{engineCfg} - if tc.multiEngine { - engineCfgs = append(engineCfgs, engineCfg) - } sCfg := config.DefaultServer().WithEngines(engineCfgs...) 
- // tests are for pre-engine-start scenario so pass notStarted: true - cs := mockControlService(t, log, sCfg, tc.bmbc, tc.smbc, nil, true) + var cs *ControlService + if tc.enginesNotReady { + cs = mockControlService(t, log, sCfg, nil, tc.smbc, nil, true) + } else { + cs = mockControlService(t, log, sCfg, nil, tc.smbc, nil) + } + + scanBdevs = func(_ context.Context, c *ControlService, _ *ctlpb.ScanNvmeReq, _ []*ctlpb.ScmNamespace) (*ctlpb.ScanNvmeResp, error) { + return tc.bdevScanRes, tc.bdevScanErr + } + defer func() { + scanBdevs = bdevScan + }() if tc.req == nil { tc.req = &ctlpb.StorageScanReq{ @@ -447,12 +767,6 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { return } - if tc.req.Nvme.Health || tc.req.Nvme.Meta { - if len(cs.harness.instances) == 0 { - tc.expResp.Nvme.Ctrlrs = nil - } - } - if diff := cmp.Diff(tc.expResp, resp, defStorageScanCmpOpts...); diff != "" { t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) } @@ -460,1071 +774,6 @@ func TestServer_CtlSvc_StorageScan_PreEngineStart(t *testing.T) { } } -//func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { -// const ( -// clusterSize uint64 = 32 * humanize.MiByte -// metaWalSize uint64 = 64 * humanize.MiByte -// rdbSize uint64 = defaultRdbSize -// rdbWalSize uint64 = 512 * humanize.MiByte -// ) -// -// adjustScmSize := func(sizeBytes uint64, withMdDaosScm bool) uint64 { -// mdBytes := rdbSize + mdFsScmBytes -// if withMdDaosScm { -// mdBytes += mdDaosScmBytes -// } -// -// if sizeBytes < mdBytes { -// return 0 -// } -// -// return sizeBytes - mdBytes -// } -// -// adjustNvmeSize := func(nvmeCtlr *ctlpb.NvmeController, mdBytes uint64, engineTargetCount int) *ctlpb.NvmeController { -// getClusterCount := func(sizeBytes uint64) uint64 { -// clusterCount := sizeBytes / clusterSize -// if sizeBytes%clusterSize != 0 { -// clusterCount += 1 -// } -// return clusterCount -// } -// -// type deviceSizeStat struct { -// clusterPerTarget uint64 -// smdDevs []*ctlpb.SmdDevice -// } -// devicesToAdjust := make(map[uint32]*deviceSizeStat, 0) -// for _, dev := range nvmeCtlr.GetSmdDevices() { -// targetCount := uint64(len(dev.GetTgtIds())) -// dev.MetaSize = adjustScmSize(mdBytes, false) / uint64(engineTargetCount) -// dev.AvailBytes = (dev.GetAvailBytes() / clusterSize) * clusterSize -// -// usableClusterCount := dev.GetAvailBytes() / clusterSize -// usableClusterCount -= getClusterCount(dev.MetaSize) * uint64(engineTargetCount) -// usableClusterCount -= getClusterCount(metaWalSize) * uint64(engineTargetCount) -// usableClusterCount -= getClusterCount(rdbSize) -// usableClusterCount -= getClusterCount(rdbWalSize) -// -// rank := dev.GetRank() -// if devicesToAdjust[rank] == nil { -// devicesToAdjust[rank] = &deviceSizeStat{ -// clusterPerTarget: math.MaxUint64, -// } -// } -// devicesToAdjust[rank].smdDevs = append(devicesToAdjust[rank].smdDevs, dev) -// clusterPerTarget := usableClusterCount / targetCount -// if clusterPerTarget < devicesToAdjust[rank].clusterPerTarget { -// devicesToAdjust[rank].clusterPerTarget = clusterPerTarget -// } -// } -// -// for _, item := range devicesToAdjust { -// for _, dev := range item.smdDevs { -// targetCount := uint64(len(dev.GetTgtIds())) -// dev.UsableBytes = item.clusterPerTarget * targetCount * clusterSize -// } -// } -// -// return nvmeCtlr -// } -// -// // output to be returned from mock bdev backend -// newCtrlr := func(idx int32) *storage.NvmeController { -// ctrlr := storage.MockNvmeController(idx) -// ctrlr.Serial = test.MockUUID(idx) -// 
ctrlr.SmdDevices = nil -// -// return ctrlr -// } -// newCtrlrMultiNs := func(idx int32, numNss int) *storage.NvmeController { -// ctrlr := storage.MockNvmeController(idx) -// ctrlr.Serial = test.MockUUID(idx) -// ctrlr.SmdDevices = nil -// ctrlr.Namespaces = make([]*storage.NvmeNamespace, numNss) -// for i := 0; i < numNss; i++ { -// ctrlr.Namespaces[i] = storage.MockNvmeNamespace(int32(i + 1)) -// } -// -// return ctrlr -// } -// -// // expected protobuf output to be returned svc.StorageScan when health -// // updated over drpc. Override serial uuid with variable argument -// newCtrlrHealth := func(idx int32, serialIdx ...int32) (*ctlpb.NvmeController, *ctlpb.BioHealthResp) { -// ctrlr := proto.MockNvmeController(idx) -// sIdx := idx -// if len(serialIdx) > 0 { -// sIdx = serialIdx[0] -// } -// ctrlr.Model = fmt.Sprintf("model-%d", sIdx) -// ctrlr.Serial = test.MockUUID(sIdx) -// ctrlr.HealthStats = proto.MockNvmeHealth(idx + 1) -// ctrlr.HealthStats.ClusterSize = clusterSize -// ctrlr.HealthStats.MetaWalSize = metaWalSize -// ctrlr.HealthStats.RdbWalSize = rdbWalSize -// ctrlr.SmdDevices = nil -// -// bioHealthResp := new(ctlpb.BioHealthResp) -// if err := convert.Types(ctrlr.HealthStats, bioHealthResp); err != nil { -// t.Fatal(err) -// } -// bioHealthResp.TotalBytes = uint64(idx) * uint64(humanize.TByte) -// bioHealthResp.AvailBytes = uint64(idx) * uint64(humanize.TByte/2) -// bioHealthResp.ClusterSize = clusterSize -// bioHealthResp.MetaWalSize = metaWalSize -// bioHealthResp.RdbWalSize = rdbWalSize -// -// return ctrlr, bioHealthResp -// } -// newCtrlrPBwHealth := func(idx int32, serialIdx ...int32) *ctlpb.NvmeController { -// c, _ := newCtrlrHealth(idx, serialIdx...) -// return c -// } -// newBioHealthResp := func(idx int32, serialIdx ...int32) *ctlpb.BioHealthResp { -// _, b := newCtrlrHealth(idx, serialIdx...) 
-// return b -// } -// -// // expected protobuf output to be returned svc.StorageScan when smd -// // updated over drpc -// newCtrlrMeta := func(ctrlrIdx int32, smdIndexes ...int32) (*ctlpb.NvmeController, *ctlpb.SmdDevResp) { -// ctrlr := proto.MockNvmeController(ctrlrIdx) -// ctrlr.Serial = test.MockUUID(ctrlrIdx) -// ctrlr.HealthStats = nil -// -// if len(smdIndexes) == 0 { -// smdIndexes = append(smdIndexes, ctrlrIdx) -// } -// smdDevRespDevices := make([]*ctlpb.SmdDevice, len(smdIndexes)) -// ctrlr.SmdDevices = make([]*ctlpb.SmdDevice, len(smdIndexes)) -// ctrlr.Namespaces = make([]*ctlpb.NvmeController_Namespace, len(smdIndexes)) -// for i, idx := range smdIndexes { -// sd := proto.MockSmdDevice(ctrlr.PciAddr, idx+1) -// sd.DevState = devStateNormal -// sd.Rank = uint32(ctrlrIdx) -// sd.TrAddr = ctrlr.PciAddr -// ctrlr.SmdDevices[i] = sd -// -// smdPB := new(ctlpb.SmdDevice) -// if err := convert.Types(sd, smdPB); err != nil { -// t.Fatal(err) -// } -// smdDevRespDevices[i] = smdPB -// -// // expect resultant controller to have updated utilization values -// ctrlr.SmdDevices[i].TotalBytes = uint64(idx) * uint64(humanize.TByte) -// ctrlr.SmdDevices[i].AvailBytes = uint64(idx) * uint64(humanize.TByte/2) -// ctrlr.SmdDevices[i].ClusterSize = clusterSize -// ctrlr.SmdDevices[i].MetaWalSize = metaWalSize -// ctrlr.SmdDevices[i].RdbSize = rdbSize -// ctrlr.SmdDevices[i].RdbWalSize = rdbWalSize -// ctrlr.Namespaces[i] = proto.MockNvmeNamespace(int32(i + 1)) -// } -// -// return ctrlr, &ctlpb.SmdDevResp{Devices: smdDevRespDevices} -// } -// newCtrlrPB := func(idx int32) *ctlpb.NvmeController { -// c, _ := newCtrlrMeta(idx) -// c.SmdDevices = nil -// return c -// } -// newCtrlrPBwBasic := func(idx int32) *ctlpb.NvmeController { -// c := newCtrlrPB(idx) -// c.FwRev = "" -// c.Model = "" -// return c -// } -// newCtrlrPBwMeta := func(idx int32, smdIndexes ...int32) *ctlpb.NvmeController { -// c, _ := newCtrlrMeta(idx, smdIndexes...) -// return c -// } -// newSmdDevResp := func(idx int32, smdIndexes ...int32) *ctlpb.SmdDevResp { -// _, s := newCtrlrMeta(idx, smdIndexes...) 
-// return s -// } -// -// smdDevRespStateNew := newSmdDevResp(1) -// smdDevRespStateNew.Devices[0].DevState = devStateNew -// smdDevRespStateNew.Devices[0].ClusterSize = 0 -// smdDevRespStateNew.Devices[0].MetaWalSize = 0 -// smdDevRespStateNew.Devices[0].RdbWalSize = 0 -// -// ctrlrPBwMetaNew := newCtrlrPBwMeta(1) -// ctrlrPBwMetaNew.SmdDevices[0].AvailBytes = 0 -// ctrlrPBwMetaNew.SmdDevices[0].TotalBytes = 0 -// ctrlrPBwMetaNew.SmdDevices[0].DevState = devStateNew -// ctrlrPBwMetaNew.SmdDevices[0].ClusterSize = 0 -// ctrlrPBwMetaNew.SmdDevices[0].UsableBytes = 0 -// ctrlrPBwMetaNew.SmdDevices[0].RdbSize = 0 -// ctrlrPBwMetaNew.SmdDevices[0].RdbWalSize = 0 -// ctrlrPBwMetaNew.SmdDevices[0].MetaSize = 0 -// ctrlrPBwMetaNew.SmdDevices[0].MetaWalSize = 0 -// -// ctrlrPBwMetaNormal := newCtrlrPBwMeta(1) -// ctrlrPBwMetaNormal.SmdDevices[0].AvailBytes = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].TotalBytes = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].DevState = devStateNormal -// ctrlrPBwMetaNormal.SmdDevices[0].ClusterSize = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].UsableBytes = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].RdbSize = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].RdbWalSize = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].MetaSize = 0 -// ctrlrPBwMetaNormal.SmdDevices[0].MetaWalSize = 0 -// -// mockPbScmMount0 := proto.MockScmMountPoint(0) -// mockPbScmMount0.Rank += 1 -// mockPbScmNamespace0 := proto.MockScmNamespace(0) -// mockPbScmNamespace0.Mount = mockPbScmMount0 -// mockPbScmMount1 := proto.MockScmMountPoint(1) -// mockPbScmMount1.Rank += 1 -// mockPbScmNamespace1 := proto.MockScmNamespace(1) -// mockPbScmNamespace1.Mount = mockPbScmMount1 -// -// for name, tc := range map[string]struct { -// req *ctlpb.StorageScanReq -// csCtrlrs *storage.NvmeControllers // control service storage provider -// eCtrlrs []*storage.NvmeControllers // engine storage provider -// smbc *scm.MockBackendConfig -// smsc *system.MockSysConfig -// storageCfgs []storage.TierConfigs -// engineTargetCount []int -// enginesNotReady bool -// scanTwice bool -// junkResp bool -// drpcResps map[int][]*mockDrpcResponse -// expErr error -// expResp *ctlpb.StorageScanResp -// }{ -// "engine up; scan bdev basic": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Basic: true}, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: {}, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{newCtrlrPBwBasic(1)}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engine up; scan bdev basic; no bdevs in config": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Basic: true}, -// }, -// storageCfgs: []storage.TierConfigs{}, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{newCtrlrPBwBasic(1)}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engine up; scan bdev basic; missing bdev in config": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Basic: true}, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(2)}, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: {}, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engine up; scan bdev health": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Health: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// {Message: newBioHealthResp(1)}, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{newCtrlrPBwHealth(1)}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engine up; scan bdev meta": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{ -// storage.MockScmModule(0), -// }, -// GetNamespacesRes: storage.ScmNamespaces{ -// storage.MockScmNamespace(0), -// }, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). -// WithScmDeviceList(mockPbScmNamespace0.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// {Message: newBioHealthResp(1)}, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ -// adjustNvmeSize(newCtrlrPBwMeta(1), mockPbScmMount0.AvailBytes, 4), -// }, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// Namespaces: proto.ScmNamespaces{ -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace0.Blockdev, -// Dev: mockPbScmNamespace0.Dev, -// Size: mockPbScmNamespace0.Size, -// Uuid: mockPbScmNamespace0.Uuid, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount0.Class, -// DeviceList: mockPbScmMount0.DeviceList, -// Path: mockPbScmMount0.Path, -// TotalBytes: mockPbScmMount0.TotalBytes, -// AvailBytes: mockPbScmMount0.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount0.AvailBytes, false), -// Rank: mockPbScmMount0.Rank, -// }, -// }, -// }, -// State: new(ctlpb.ResponseState), -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engines up; scan bdev health": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Health: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1), newCtrlr(2)}, -// eCtrlrs: []*storage.NvmeControllers{{newCtrlr(1)}, {newCtrlr(2)}}, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(2).PciAddr), -// }, -// }, -// engineTargetCount: []int{4, 4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// {Message: newBioHealthResp(1)}, -// }, -// 1: { -// {Message: newSmdDevResp(2)}, -// {Message: newBioHealthResp(2)}, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ -// newCtrlrPBwHealth(1), -// newCtrlrPBwHealth(2), -// }, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "engines up; scan bdev meta; multiple nvme namespaces": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{ -// newCtrlrMultiNs(1, 2), newCtrlrMultiNs(2, 2), -// }, -// eCtrlrs: []*storage.NvmeControllers{ -// {newCtrlrMultiNs(1, 2)}, {newCtrlrMultiNs(2, 2)}, -// }, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{ -// storage.MockScmModule(0), -// }, -// GetNamespacesRes: storage.ScmNamespaces{ -// storage.MockScmNamespace(0), -// storage.MockScmNamespace(1), -// }, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// { -// Total: mockPbScmMount1.TotalBytes, -// Avail: mockPbScmMount1.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). -// WithScmDeviceList(mockPbScmNamespace0.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount1.Path). -// WithScmDeviceList(mockPbScmNamespace1.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(2).PciAddr), -// }, -// }, -// engineTargetCount: []int{8, 8}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1, 1, 2)}, -// {Message: newBioHealthResp(1, 1)}, -// {Message: newBioHealthResp(2, 1)}, -// }, -// 1: { -// {Message: newSmdDevResp(2, 3, 4)}, -// {Message: newBioHealthResp(3, 2)}, -// {Message: newBioHealthResp(4, 2)}, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ -// adjustNvmeSize(newCtrlrPBwMeta(1, 1, 2), mockPbScmMount0.AvailBytes, 8), -// adjustNvmeSize(newCtrlrPBwMeta(2, 3, 4), mockPbScmMount1.AvailBytes, 8), -// }, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// Namespaces: proto.ScmNamespaces{ -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace0.Blockdev, -// Dev: mockPbScmNamespace0.Dev, -// Size: mockPbScmNamespace0.Size, -// Uuid: mockPbScmNamespace0.Uuid, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount0.Class, -// DeviceList: mockPbScmMount0.DeviceList, -// Path: mockPbScmMount0.Path, -// TotalBytes: mockPbScmMount0.TotalBytes, -// AvailBytes: mockPbScmMount0.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount0.AvailBytes, false), -// Rank: mockPbScmMount0.Rank, -// }, -// }, -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace1.Blockdev, -// Dev: mockPbScmNamespace1.Dev, -// Size: mockPbScmNamespace1.Size, -// Uuid: mockPbScmNamespace1.Uuid, -// NumaNode: mockPbScmNamespace1.NumaNode, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount1.Class, -// DeviceList: mockPbScmMount1.DeviceList, -// Path: mockPbScmMount1.Path, -// TotalBytes: mockPbScmMount1.TotalBytes, -// AvailBytes: mockPbScmMount1.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount1.AvailBytes, false), -// Rank: mockPbScmMount1.Rank, -// }, -// }, -// }, -// State: new(ctlpb.ResponseState), -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "scan scm usage": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: new(ctlpb.ScanNvmeReq), -// }, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{storage.MockScmModule(0)}, -// GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace(0)}, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). 
-// WithScmDeviceList(mockPbScmNamespace0.Blockdev), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: {}, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// Namespaces: proto.ScmNamespaces{ -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace0.Blockdev, -// Dev: mockPbScmNamespace0.Dev, -// Size: mockPbScmNamespace0.Size, -// Uuid: mockPbScmNamespace0.Uuid, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount0.Class, -// DeviceList: mockPbScmMount0.DeviceList, -// Path: mockPbScmMount0.Path, -// Rank: mockPbScmMount0.Rank, -// TotalBytes: mockPbScmMount0.TotalBytes, -// AvailBytes: mockPbScmMount0.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount0.AvailBytes, true), -// }, -// }, -// }, -// State: new(ctlpb.ResponseState), -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "scan scm usage; pmem not in instance device list": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: new(ctlpb.ScanNvmeReq), -// }, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{storage.MockScmModule(0)}, -// GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace(0)}, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). -// WithScmDeviceList("/dev/foo", "/dev/bar"), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: {}, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// State: &ctlpb.ResponseState{ -// Status: ctlpb.ResponseStatus_CTL_ERR_SCM, -// Error: "instance 0: no pmem namespace for mount /mnt/daos0", -// }, -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "scan scm usage; class ram": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: new(ctlpb.ScanNvmeReq), -// }, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{storage.MockScmModule(0)}, -// GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace(0)}, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassRam.String()). -// WithScmMountPoint(mockPbScmMount0.Path). 
-// WithScmRamdiskSize(16), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: {}, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// Namespaces: proto.ScmNamespaces{ -// &ctlpb.ScmNamespace{ -// Blockdev: "ramdisk", -// Size: uint64(humanize.GiByte * 16), -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: "ram", -// Path: mockPbScmMount0.Path, -// TotalBytes: mockPbScmMount0.TotalBytes, -// AvailBytes: mockPbScmMount0.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount0.AvailBytes, true), -// Rank: mockPbScmMount0.Rank, -// }, -// }, -// }, -// State: new(ctlpb.ResponseState), -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "multi-engine; multi-tier; with usage": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1), newCtrlr(2)}, -// eCtrlrs: []*storage.NvmeControllers{{newCtrlr(1)}, {newCtrlr(2)}}, -// smbc: &scm.MockBackendConfig{ -// GetModulesRes: storage.ScmModules{ -// storage.MockScmModule(0), -// }, -// GetNamespacesRes: storage.ScmNamespaces{ -// storage.MockScmNamespace(0), -// storage.MockScmNamespace(1), -// }, -// }, -// smsc: &system.MockSysConfig{ -// GetfsUsageResps: []system.GetfsUsageRetval{ -// { -// Total: mockPbScmMount0.TotalBytes, -// Avail: mockPbScmMount0.AvailBytes, -// }, -// { -// Total: mockPbScmMount1.TotalBytes, -// Avail: mockPbScmMount1.AvailBytes, -// }, -// }, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). -// WithScmDeviceList(mockPbScmNamespace0.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount1.Path). -// WithScmDeviceList(mockPbScmNamespace1.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(newCtrlr(2).PciAddr), -// }, -// }, -// engineTargetCount: []int{4, 4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// {Message: newBioHealthResp(1)}, -// }, -// 1: { -// {Message: newSmdDevResp(2)}, -// {Message: newBioHealthResp(2)}, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ -// adjustNvmeSize(newCtrlrPBwMeta(1), mockPbScmMount0.AvailBytes, 4), -// adjustNvmeSize(newCtrlrPBwMeta(2), mockPbScmMount1.AvailBytes, 4), -// }, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{ -// Namespaces: proto.ScmNamespaces{ -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace0.Blockdev, -// Dev: mockPbScmNamespace0.Dev, -// Size: mockPbScmNamespace0.Size, -// Uuid: mockPbScmNamespace0.Uuid, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount0.Class, -// DeviceList: mockPbScmMount0.DeviceList, -// Path: mockPbScmMount0.Path, -// TotalBytes: mockPbScmMount0.TotalBytes, -// AvailBytes: mockPbScmMount0.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount0.AvailBytes, false), -// Rank: mockPbScmMount0.Rank, -// }, -// }, -// &ctlpb.ScmNamespace{ -// Blockdev: mockPbScmNamespace1.Blockdev, -// Dev: mockPbScmNamespace1.Dev, -// Size: mockPbScmNamespace1.Size, -// Uuid: mockPbScmNamespace1.Uuid, -// NumaNode: mockPbScmNamespace1.NumaNode, -// Mount: &ctlpb.ScmNamespace_Mount{ -// Class: mockPbScmMount1.Class, -// DeviceList: mockPbScmMount1.DeviceList, -// Path: mockPbScmMount1.Path, -// TotalBytes: mockPbScmMount1.TotalBytes, -// AvailBytes: mockPbScmMount1.AvailBytes, -// UsableBytes: adjustScmSize(mockPbScmMount1.AvailBytes, false), -// Rank: mockPbScmMount1.Rank, -// }, -// }, -// }, -// State: new(ctlpb.ResponseState), -// }, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "multi-engine; multi-tier; with usage; engines not ready": { -// req: &ctlpb.StorageScanReq{ -// Scm: &ctlpb.ScanScmReq{Usage: true}, -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount0.Path). -// WithScmDeviceList(mockPbScmNamespace0.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassDcpm.String()). -// WithScmMountPoint(mockPbScmMount1.Path). -// WithScmDeviceList(mockPbScmNamespace1.Blockdev), -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(2).PciAddr), -// }, -// }, -// engineTargetCount: []int{4, 4}, -// enginesNotReady: true, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// {Message: newBioHealthResp(1)}, -// }, -// 1: { -// {Message: newSmdDevResp(2)}, -// {Message: newBioHealthResp(2)}, -// }, -// }, -// expErr: errEngineNotReady, -// }, -// // Sometimes when more than a few ssds are assigned to engine without many targets, -// // some of the smd entries for the latter ssds are in state "NEW" rather than -// // "NORMAL", when in this state, health is unavailable and DER_NONEXIST is returned. 
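
The comment above captures a real protocol detail: health stats do not yet exist for SMD devices in the "NEW" state, so the health query comes back with DER_NONEXIST (or an out-of-memory status) and the scan is expected to omit stats rather than fail, which is what the removed cases below were asserting. A standalone sketch of that tolerated-status handling follows, with stand-in types in place of daos.Status and the drpc plumbing.

package main

import (
	"errors"
	"fmt"
)

// status stands in for the daos.Status codes seen in the cases below.
type status int

const (
	success      status = iota
	nonexistent         // DER_NONEXIST: no health entry for this device yet
	freeMemError        // out-of-memory while servicing the query
)

type bioHealth struct{ TotalBytes uint64 }

// healthOrNil treats the two "no stats yet" statuses as benign: the device
// stays in the scan response without health data instead of failing it.
func healthOrNil(st status, h *bioHealth) (*bioHealth, error) {
	switch st {
	case success:
		return h, nil
	case nonexistent, freeMemError:
		return nil, nil
	default:
		return nil, errors.New("bio health query failed")
	}
}

func main() {
	h, err := healthOrNil(nonexistent, &bioHealth{})
	fmt.Println(h == nil, err == nil) // true true: tolerated, stats omitted
}
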
-// "bdev scan; meta; new state; non-existent smd health": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: smdDevRespStateNew}, -// { -// Message: &ctlpb.BioHealthResp{ -// Status: int32(daos.Nonexistent), -// }, -// }, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ctrlrPBwMetaNew}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "bdev scan; meta; new state; nomem smd health": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). -// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: smdDevRespStateNew}, -// { -// Message: &ctlpb.BioHealthResp{ -// Status: int32(daos.FreeMemError), -// }, -// }, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ctrlrPBwMetaNew}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// "bdev scan; meta; normal state; non-existent smd health": { -// req: &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: &ctlpb.ScanNvmeReq{Meta: true}, -// }, -// csCtrlrs: &storage.NvmeControllers{newCtrlr(1)}, -// storageCfgs: []storage.TierConfigs{ -// { -// storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(newCtrlr(1).PciAddr), -// }, -// }, -// engineTargetCount: []int{4}, -// drpcResps: map[int][]*mockDrpcResponse{ -// 0: { -// {Message: newSmdDevResp(1)}, -// { -// Message: &ctlpb.BioHealthResp{ -// Status: int32(daos.Nonexistent), -// }, -// }, -// }, -// }, -// expResp: &ctlpb.StorageScanResp{ -// Nvme: &ctlpb.ScanNvmeResp{ -// Ctrlrs: proto.NvmeControllers{ctrlrPBwMetaNormal}, -// State: new(ctlpb.ResponseState), -// }, -// Scm: &ctlpb.ScanScmResp{State: new(ctlpb.ResponseState)}, -// MemInfo: proto.MockPBMemInfo(), -// }, -// }, -// } { -// t.Run(name, func(t *testing.T) { -// log, buf := logging.NewTestLogger(t.Name()) -// defer test.ShowBufferOnFailure(t, buf) -// -// if len(tc.storageCfgs) != len(tc.drpcResps) { -// t.Fatalf("number of tc.storageCfgs doesn't match num drpc msg groups") -// } -// -// if len(tc.storageCfgs) == 1 && tc.eCtrlrs == nil && tc.csCtrlrs != nil { -// log.Debugf("using control service storage provider for first engine") -// tc.eCtrlrs = []*storage.NvmeControllers{tc.csCtrlrs} -// } -// -// var csbmbc *bdev.MockBackendConfig -// if tc.csCtrlrs != nil { -// log.Debugf("bdevs %v to be returned for control service scan", *tc.csCtrlrs) -// csbmbc = &bdev.MockBackendConfig{ -// ScanRes: &storage.BdevScanResponse{Controllers: *tc.csCtrlrs}, -// } -// } -// -// var engineCfgs []*engine.Config -// for i, sc := range tc.storageCfgs { -// log.Debugf("storage cfg contains bdevs %v for engine %d", sc.Bdevs(), i) -// engineCfgs = append(engineCfgs, -// engine.MockConfig(). -// WithStorage(sc...). -// WithTargetCount(tc.engineTargetCount[i])) -// } -// sCfg := config.DefaultServer().WithEngines(engineCfgs...) -// cs := mockControlService(t, log, sCfg, csbmbc, tc.smbc, tc.smsc) -// -// for idx, ec := range engineCfgs { -// var ebmbc *bdev.MockBackendConfig -// if tc.eCtrlrs != nil && len(tc.eCtrlrs) > idx { -// log.Debugf("bdevs %v to be returned for engine %d scan", -// *tc.eCtrlrs[idx], idx) -// ebmbc = &bdev.MockBackendConfig{ -// ScanRes: &storage.BdevScanResponse{ -// Controllers: *tc.eCtrlrs[idx], -// }, -// } -// } -// -// // replace harness instance with mock I/O Engine -// // to enable mocking of harness instance drpc channel -// sp := storage.MockProvider(log, idx, &ec.Storage, -// cs.storage.Sys, // share system provider cfo -// scm.NewMockProvider(log, tc.smbc, nil), -// bdev.NewMockProvider(log, ebmbc), nil) -// te := newTestEngine(log, false, sp, ec) -// -// if tc.enginesNotReady { -// te.ready.SetFalse() -// } -// -// // mock drpc responses -// dcc := new(mockDrpcClientConfig) -// if tc.junkResp { -// dcc.setSendMsgResponse(drpc.Status_SUCCESS, -// makeBadBytes(42), nil) -// } else if len(tc.drpcResps) > idx { -// t.Logf("setting %d drpc responses for engine %d", -// len(tc.drpcResps[idx]), idx) -// dcc.setSendMsgResponseList(t, tc.drpcResps[idx]...) 
-// } else { -// t.Fatal("drpc response mocks unpopulated") -// } -// te.setDrpcClient(newMockDrpcClient(dcc)) -// te._superblock.Rank = ranklist.NewRankPtr(uint32(idx + 1)) -// for _, tc := range te.storage.GetBdevConfigs() { -// tc.Bdev.DeviceRoles.OptionBits = storage.OptionBits(storage.BdevRoleAll) -// } -// md := te.storage.GetControlMetadata() -// md.Path = "/foo" -// md.DevicePath = md.Path -// -// cs.harness.instances[idx] = te -// } -// cs.harness.started.SetTrue() -// -// if tc.req == nil { -// tc.req = &ctlpb.StorageScanReq{ -// Scm: new(ctlpb.ScanScmReq), -// Nvme: new(ctlpb.ScanNvmeReq), -// } -// } -// -// if tc.scanTwice { -// _, err := cs.StorageScan(test.Context(t), tc.req) -// test.CmpErr(t, tc.expErr, err) -// if err != nil { -// return -// } -// } -// -// resp, err := cs.StorageScan(test.Context(t), tc.req) -// test.CmpErr(t, tc.expErr, err) -// if err != nil { -// return -// } -// -// if diff := cmp.Diff(tc.expResp, resp, defStorageScanCmpOpts...); diff != "" { -// t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) -// } -// }) -// } -//} - func TestServer_checkTmpfsMem(t *testing.T) { for name, tc := range map[string]struct { scmCfgs map[int]*storage.TierConfig @@ -1645,7 +894,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { bClass storage.Class bDevs [][]string bSize int - bmbc *bdev.MockBackendConfig + bmbcs []*bdev.MockBackendConfig awaitTimeout time.Duration getMemInfo func() (*common.MemInfo, error) expAwaitExit bool @@ -1657,6 +906,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassRam, sSize: 6, + bmbcs: []*bdev.MockBackendConfig{{}}, expResp: &ctlpb.StorageFormatResp{ Crets: []*ctlpb.NvmeControllerResult{}, Mrets: []*ctlpb.ScmMountResult{ @@ -1671,6 +921,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassDcpm, sDevs: []string{"/dev/pmem1"}, + bmbcs: []*bdev.MockBackendConfig{{}}, expResp: &ctlpb.StorageFormatResp{ Crets: []*ctlpb.NvmeControllerResult{}, Mrets: []*ctlpb.ScmMountResult{ @@ -1688,14 +939,16 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sSize: 6, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -1723,13 +976,15 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { bClass: storage.ClassFile, bDevs: [][]string{{"/tmp/daos-bdev"}}, bSize: 6, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - "/tmp/daos-bdev": new(storage.BdevDeviceFormatResponse), + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: 
storage.BdevDeviceFormatResponses{ + "/tmp/daos-bdev": new(storage.BdevDeviceFormatResponse), + }, }, }, }, @@ -1754,14 +1009,16 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sDevs: []string{"dev/pmem0"}, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -1789,9 +1046,11 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sSize: 6, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, }, }, expAwaitExit: true, @@ -1828,9 +1087,11 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sSize: 6, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, }, }, expResp: &ctlpb.StorageFormatResp{ @@ -1862,14 +1123,16 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sSize: 6, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -1900,14 +1163,16 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sSize: 6, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -1934,9 
+1199,11 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sDevs: []string{"/dev/pmem1"}, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, }, }, expResp: &ctlpb.StorageFormatResp{ @@ -1968,14 +1235,16 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sDevs: []string{"/dev/pmem1"}, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, - }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -2005,9 +1274,11 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sDevs: []string{"/dev/pmem1"}, bClass: storage.ClassNvme, bDevs: [][]string{{mockNvmeController0.PciAddr}}, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, }, }, expAwaitExit: true, @@ -2030,14 +1301,29 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { {mockNvmeController0.PciAddr}, {mockNvmeController1.PciAddr}, }, - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{mockNvmeController0, mockNvmeController1}, + // One for each engine. + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController0}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, + }, + }, }, - FormatRes: &storage.BdevFormatResponse{ - DeviceResponses: storage.BdevDeviceFormatResponses{ - mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ - Formatted: true, + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{mockNvmeController1}, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController1.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, }, }, }, @@ -2082,6 +1368,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { t.Fatal("expResp test case parameter required") } test.AssertEqual(t, len(tc.sMounts), len(tc.expResp.Mrets), name) + test.AssertEqual(t, len(tc.sMounts), len(tc.bmbcs), name) for i := range tc.sMounts { // Hack to deal with creating the mountpoint in test. 
// FIXME (DAOS-3471): The tests in this layer really shouldn't be @@ -2160,7 +1447,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { sysProv := system.NewMockSysProvider(log, smsc) mounter := mount.NewProvider(log, sysProv) scmProv := scm.NewProvider(log, nil, sysProv, mounter) - bdevProv := bdev.NewMockProvider(log, tc.bmbc) + bdevProv := bdev.NewMockProvider(log, nil) if tc.getMemInfo == nil { tc.getMemInfo = func() (*common.MemInfo, error) { return &common.MemInfo{ @@ -2182,13 +1469,6 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { srvCfg: config, } - // Mimic control service start-up and engine creation where cache is shared - // to the engines from the base control service storage provider. - nvmeScanResp, err := cs.NvmeScan(storage.BdevScanRequest{}) - if err != nil { - t.Fatal(err) - } - for i, ec := range config.Engines { root := filepath.Dir(tc.sMounts[i]) if tc.scmMounted { @@ -2202,12 +1482,13 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { trc.Running.Store(tc.instancesStarted) runner := engine.NewTestRunner(trc, ec) - storProv := storage.MockProvider(log, 0, &ec.Storage, sysProv, - scmProv, bdevProv, nil) + // Engine specific bdev provider. + ebp := bdev.NewMockProvider(log, tc.bmbcs[i]) + esp := storage.MockProvider(log, 0, &ec.Storage, sysProv, + scmProv, ebp, nil) - ei := NewEngineInstance(log, storProv, nil, runner) + ei := NewEngineInstance(log, esp, nil, runner) ei.ready.Store(tc.instancesStarted) - ei.storage.SetBdevCache(*nvmeScanResp) // if the instance is expected to have a valid superblock, create one if tc.superblockExists { @@ -2730,712 +2011,734 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { } } -//func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { -// const ( -// clusterSize uint64 = 32 * humanize.MiByte -// hugeClusterSize uint64 = humanize.GiByte -// metaSize uint64 = 64 * humanize.MiByte -// metaWalSize uint64 = 128 * humanize.MiByte -// rdbSize uint64 = 256 * humanize.MiByte -// rdbWalSize uint64 = 512 * humanize.MiByte -// ) -// -// type StorageCfg struct { -// targetCount int -// tierCfgs storage.TierConfigs -// } -// type DataInput struct { -// storageCfgs []*StorageCfg -// scanNvmeResp *ctlpb.ScanNvmeResp -// } -// type ExpectedOutput struct { -// totalBytes []uint64 -// availableBytes []uint64 -// usableBytes []uint64 -// message string -// } -// -// newTierCfg := func(pciIdx int32) *storage.TierConfig { -// return storage.NewTierConfig(). -// WithStorageClass(storage.ClassNvme.String()). 
-// WithBdevDeviceList(test.MockPCIAddr(pciIdx)) -// } -// -// newNvmeCtlr := func(nvmeCtlr *ctlpb.NvmeController) *ctlpb.NvmeController { -// for _, smdDev := range nvmeCtlr.SmdDevices { -// smdDev.ClusterSize = clusterSize -// smdDev.MetaSize = metaSize -// smdDev.MetaWalSize = metaWalSize -// smdDev.RdbSize = rdbSize -// smdDev.RdbWalSize = rdbWalSize -// } -// -// return nvmeCtlr -// } -// -// for name, tc := range map[string]struct { -// input DataInput -// output ExpectedOutput -// }{ -// "homogeneous": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 12, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// newTierCfg(3), -// }, -// }, -// { -// targetCount: 6, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(4), -// newTierCfg(5), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{4, 5, 6, 7}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(3), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme2", -// TgtIds: []int32{8, 9, 10, 11}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(4), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme3", -// TgtIds: []int32{0, 1, 2}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 1, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(5), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme4", -// TgtIds: []int32{3, 4, 5}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 1, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 8 * hugeClusterSize, -// 8 * hugeClusterSize, -// 18 * hugeClusterSize, -// 18 * hugeClusterSize, -// }, -// }, -// }, -// "heterogeneous": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 11, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// newTierCfg(3), -// }, -// }, -// { -// targetCount: 5, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(4), -// newTierCfg(5), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * 
hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{4, 5, 6}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(3), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme2", -// TgtIds: []int32{7, 8, 9, 10}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(4), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme3", -// TgtIds: []int32{0, 1, 2}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 1, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(5), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme4", -// TgtIds: []int32{3, 4}, -// TotalBytes: 20 * hugeClusterSize, -// AvailBytes: 20 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 1, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// 20 * hugeClusterSize, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 6 * hugeClusterSize, -// 8 * hugeClusterSize, -// 18 * hugeClusterSize, -// 12 * hugeClusterSize, -// }, -// }, -// }, -// "new": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 7, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{0, 1, 2}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// DevState: devStateNew, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 0, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 0, -// }, -// message: "not usable: device state \"NEW\"", -// }, -// }, -// "evicted": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 7, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: 
[]int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{0, 1, 2}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// DevState: devStateFaulty, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 0, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 0, -// }, -// message: "not usable: device state \"EVICTED\"", -// }, -// }, -// "missing targets": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 4, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 0, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 0, -// }, -// message: "not usable: missing storage info", -// }, -// }, -// "missing cluster size": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 7, -// tierCfgs: storage.TierConfigs{ -// newTierCfg(1), -// newTierCfg(2), -// }, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// { -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// ClusterSize: hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// { -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{0, 1, 2}, -// TotalBytes: 10 * hugeClusterSize, -// AvailBytes: 10 * hugeClusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData, -// }, -// }, -// }, -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 10 * hugeClusterSize, -// 10 * hugeClusterSize, -// }, -// availableBytes: []uint64{ -// 10 * hugeClusterSize, -// 0, -// }, -// usableBytes: []uint64{ -// 8 * hugeClusterSize, -// 0, -// }, -// message: "not usable: missing storage info", -// }, -// }, -// "multi bdev tier": { -// input: DataInput{ -// storageCfgs: []*StorageCfg{ -// { -// targetCount: 5, -// tierCfgs: storage.TierConfigs{newTierCfg(1)}, -// }, -// { -// targetCount: 4, -// tierCfgs: storage.TierConfigs{newTierCfg(2)}, -// }, 
-// { -// targetCount: 6, -// tierCfgs: storage.TierConfigs{newTierCfg(3)}, -// }, -// { -// targetCount: 4, -// tierCfgs: storage.TierConfigs{newTierCfg(4)}, -// }, -// { -// targetCount: 5, -// tierCfgs: storage.TierConfigs{newTierCfg(5)}, -// }, -// { -// targetCount: 6, -// tierCfgs: storage.TierConfigs{newTierCfg(6)}, -// }, -// }, -// scanNvmeResp: &ctlpb.ScanNvmeResp{ -// Ctrlrs: []*ctlpb.NvmeController{ -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(1), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme0", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 0, -// RoleBits: storage.BdevRoleData | storage.BdevRoleMeta, -// }, -// }, -// }), -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(2), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme1", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 1, -// RoleBits: storage.BdevRoleData | storage.BdevRoleWAL, -// }, -// }, -// }), -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(3), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme2", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 2, -// RoleBits: storage.BdevRoleAll, -// }, -// }, -// }), -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(4), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme3", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 3, -// RoleBits: storage.BdevRoleWAL, -// }, -// }, -// }), -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(5), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme4", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 4, -// RoleBits: storage.BdevRoleMeta, -// }, -// }, -// }), -// newNvmeCtlr(&ctlpb.NvmeController{ -// PciAddr: test.MockPCIAddr(6), -// SmdDevices: []*ctlpb.SmdDevice{ -// { -// Uuid: "nvme5", -// TgtIds: []int32{0, 1, 2, 3}, -// TotalBytes: 10 * humanize.GiByte, -// AvailBytes: 10 * humanize.GiByte, -// ClusterSize: clusterSize, -// Rank: 5, -// RoleBits: storage.BdevRoleMeta | storage.BdevRoleMeta, -// }, -// }, -// }), -// }, -// }, -// }, -// output: ExpectedOutput{ -// totalBytes: []uint64{ -// 320 * clusterSize, -// 320 * clusterSize, -// 320 * clusterSize, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// }, -// availableBytes: []uint64{ -// 320 * clusterSize, -// 320 * clusterSize, -// 320 * clusterSize, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// }, -// usableBytes: []uint64{ -// 300 * clusterSize, -// 288 * clusterSize, -// 260 * clusterSize, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// 0 * humanize.GiByte, -// }, -// }, -// }, -// } { -// t.Run(name, func(t *testing.T) { -// log, buf := logging.NewTestLogger(t.Name()) -// defer test.ShowBufferOnFailure(t, buf) -// -// engineCfgs := []*engine.Config{} -// for idx, sc := range tc.input.storageCfgs { -// ec := engine.MockConfig().WithStorage(sc.tierCfgs...) 
-// ec.TargetCount = sc.targetCount -// ec.Index = uint32(idx) -// engineCfgs = append(engineCfgs, ec) -// } -// serverCfg := config.DefaultServer().WithEngines(engineCfgs...) -// cs := mockControlService(t, log, serverCfg, nil, nil, nil) -// -// cs.adjustNvmeSize(tc.input.scanNvmeResp) -// -// for idx, ctlr := range tc.input.scanNvmeResp.GetCtrlrs() { -// dev := ctlr.GetSmdDevices()[0] -// test.AssertEqual(t, tc.output.totalBytes[idx], dev.GetTotalBytes(), -// fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): wait=%d, got=%d", -// ctlr.GetPciAddr(), idx, tc.output.totalBytes[idx], dev.GetTotalBytes())) -// test.AssertEqual(t, tc.output.availableBytes[idx], dev.GetAvailBytes(), -// fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): wait=%d, got=%d", -// ctlr.GetPciAddr(), idx, tc.output.availableBytes[idx], dev.GetAvailBytes())) -// test.AssertEqual(t, tc.output.usableBytes[idx], dev.GetUsableBytes(), -// fmt.Sprintf("Invalid usable bytes with ctlr %s (index=%d), "+ -// "wait=%d (%d clusters) got=%d (%d clusters)", -// ctlr.GetPciAddr(), idx, -// tc.output.usableBytes[idx], tc.output.usableBytes[idx]/clusterSize, -// dev.GetUsableBytes(), dev.GetUsableBytes()/clusterSize)) -// } -// if tc.output.message != "" { -// test.AssertTrue(t, -// strings.Contains(buf.String(), tc.output.message), -// "missing message: "+tc.output.message) -// } -// }) -// } -//} +func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { + const ( + clusterSize uint64 = 32 * humanize.MiByte + hugeClusterSize uint64 = humanize.GiByte + metaSize uint64 = 64 * humanize.MiByte + metaWalSize uint64 = 128 * humanize.MiByte + rdbSize uint64 = 256 * humanize.MiByte + rdbWalSize uint64 = 512 * humanize.MiByte + ) + + type StorageCfg struct { + targetCount int + tierCfgs storage.TierConfigs + } + type DataInput struct { + storageCfgs []*StorageCfg + scanNvmeResp *ctlpb.ScanNvmeResp + } + type ExpectedOutput struct { + totalBytes []uint64 + availableBytes []uint64 + usableBytes []uint64 + message string + } + + newTierCfg := func(pciIdx int32) *storage.TierConfig { + return storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). 
+ WithBdevDeviceList(test.MockPCIAddr(pciIdx)) + } + + newNvmeCtlr := func(nvmeCtlr *ctlpb.NvmeController) *ctlpb.NvmeController { + for _, smdDev := range nvmeCtlr.SmdDevices { + smdDev.ClusterSize = clusterSize + smdDev.MetaSize = metaSize + smdDev.MetaWalSize = metaWalSize + smdDev.RdbSize = rdbSize + smdDev.RdbWalSize = rdbWalSize + } + + return nvmeCtlr + } + + for name, tc := range map[string]struct { + input DataInput + output ExpectedOutput + }{ + "homogeneous": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 12, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + newTierCfg(3), + }, + }, + { + targetCount: 6, + tierCfgs: storage.TierConfigs{ + newTierCfg(4), + newTierCfg(5), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{4, 5, 6, 7}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(3), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme2", + TgtIds: []int32{8, 9, 10, 11}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(4), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme3", + TgtIds: []int32{0, 1, 2}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 1, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(5), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme4", + TgtIds: []int32{3, 4, 5}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 1, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 8 * hugeClusterSize, + 8 * hugeClusterSize, + 18 * hugeClusterSize, + 18 * hugeClusterSize, + }, + }, + }, + "heterogeneous": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 11, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + newTierCfg(3), + }, + }, + { + targetCount: 5, + tierCfgs: storage.TierConfigs{ + newTierCfg(4), + newTierCfg(5), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: 
[]*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{4, 5, 6}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(3), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme2", + TgtIds: []int32{7, 8, 9, 10}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(4), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme3", + TgtIds: []int32{0, 1, 2}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 1, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(5), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme4", + TgtIds: []int32{3, 4}, + TotalBytes: 20 * hugeClusterSize, + AvailBytes: 20 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 1, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + 20 * hugeClusterSize, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 6 * hugeClusterSize, + 8 * hugeClusterSize, + 18 * hugeClusterSize, + 12 * hugeClusterSize, + }, + }, + }, + "new": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 7, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{0, 1, 2}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNew, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 0, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 0, + }, + message: "not usable: device state \"NEW\"", + }, + }, + "evicted": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 7, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{0, 1, 2}, + TotalBytes: 10 * 
hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateFaulty, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 0, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 0, + }, + message: "not usable: device state \"EVICTED\"", + }, + }, + "missing targets": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 4, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 0, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 0, + }, + message: "not usable: missing storage info", + }, + }, + "missing cluster size": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 7, + tierCfgs: storage.TierConfigs{ + newTierCfg(1), + newTierCfg(2), + }, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + { + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + ClusterSize: hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + { + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{0, 1, 2}, + TotalBytes: 10 * hugeClusterSize, + AvailBytes: 10 * hugeClusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData, + }, + }, + DevState: devStateNormal, + }, + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 10 * hugeClusterSize, + 10 * hugeClusterSize, + }, + availableBytes: []uint64{ + 10 * hugeClusterSize, + 0, + }, + usableBytes: []uint64{ + 8 * hugeClusterSize, + 0, + }, + message: "not usable: missing storage info", + }, + }, + "multi bdev tier": { + input: DataInput{ + storageCfgs: []*StorageCfg{ + { + targetCount: 5, + tierCfgs: storage.TierConfigs{newTierCfg(1)}, + }, + { + targetCount: 4, + tierCfgs: storage.TierConfigs{newTierCfg(2)}, + }, + { + targetCount: 6, + tierCfgs: storage.TierConfigs{newTierCfg(3)}, + }, + { + targetCount: 4, + tierCfgs: storage.TierConfigs{newTierCfg(4)}, + }, + { + targetCount: 5, + tierCfgs: storage.TierConfigs{newTierCfg(5)}, + }, + { + targetCount: 6, + tierCfgs: storage.TierConfigs{newTierCfg(6)}, + }, + }, + scanNvmeResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: []*ctlpb.NvmeController{ + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(1), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme0", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * 
humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 0, + RoleBits: storage.BdevRoleData | storage.BdevRoleMeta, + }, + }, + DevState: devStateNormal, + }), + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(2), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme1", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 1, + RoleBits: storage.BdevRoleData | storage.BdevRoleWAL, + }, + }, + DevState: devStateNormal, + }), + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(3), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme2", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 2, + RoleBits: storage.BdevRoleAll, + }, + }, + DevState: devStateNormal, + }), + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(4), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme3", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 3, + RoleBits: storage.BdevRoleWAL, + }, + }, + DevState: devStateNormal, + }), + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(5), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme4", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 4, + RoleBits: storage.BdevRoleMeta, + }, + }, + DevState: devStateNormal, + }), + newNvmeCtlr(&ctlpb.NvmeController{ + PciAddr: test.MockPCIAddr(6), + SmdDevices: []*ctlpb.SmdDevice{ + { + Uuid: "nvme5", + TgtIds: []int32{0, 1, 2, 3}, + TotalBytes: 10 * humanize.GiByte, + AvailBytes: 10 * humanize.GiByte, + ClusterSize: clusterSize, + Rank: 5, + RoleBits: storage.BdevRoleMeta | storage.BdevRoleMeta, + }, + }, + DevState: devStateNormal, + }), + }, + }, + }, + output: ExpectedOutput{ + totalBytes: []uint64{ + 320 * clusterSize, + 320 * clusterSize, + 320 * clusterSize, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + }, + availableBytes: []uint64{ + 320 * clusterSize, + 320 * clusterSize, + 320 * clusterSize, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + }, + usableBytes: []uint64{ + 300 * clusterSize, + 288 * clusterSize, + 260 * clusterSize, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + 0 * humanize.GiByte, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + engineCfgs := []*engine.Config{} + for idx, sc := range tc.input.storageCfgs { + ec := engine.MockConfig().WithStorage(sc.tierCfgs...) + ec.TargetCount = sc.targetCount + ec.Index = uint32(idx) + engineCfgs = append(engineCfgs, ec) + } + serverCfg := config.DefaultServer().WithEngines(engineCfgs...) 
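+			// Sanity note on the figures above (a derivation from the fixtures,
+			// assuming adjustNvmeSize hands out whole clusters per target): for
+			// the single-role "homogeneous"/"heterogeneous" cases, each rank's
+			// clustersPerTarget = min over its devices of
+			// floor(availClusters/len(TgtIds)), and a device's usableBytes =
+			// clustersPerTarget * len(TgtIds) * ClusterSize. E.g. "homogeneous"
+			// rank 0: min(floor(10/4), floor(10/4), floor(20/4)) = 2, so each
+			// device is 2*4 = 8 clusters usable; rank 1: min(floor(20/3),
+			// floor(20/3)) = 6, giving 6*3 = 18 clusters per device.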
+			cs := mockControlService(t, log, serverCfg, nil, nil, nil)
+
+			cs.adjustNvmeSize(tc.input.scanNvmeResp)
+
+			for idx, ctlr := range tc.input.scanNvmeResp.GetCtrlrs() {
+				dev := ctlr.GetSmdDevices()[0]
+				test.AssertEqual(t, tc.output.totalBytes[idx], dev.GetTotalBytes(),
+					fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): want=%d, got=%d",
+						ctlr.GetPciAddr(), idx, tc.output.totalBytes[idx], dev.GetTotalBytes()))
+				test.AssertEqual(t, tc.output.availableBytes[idx], dev.GetAvailBytes(),
+					fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): want=%d, got=%d",
+						ctlr.GetPciAddr(), idx, tc.output.availableBytes[idx], dev.GetAvailBytes()))
+				test.AssertEqual(t, tc.output.usableBytes[idx], dev.GetUsableBytes(),
+					fmt.Sprintf("Invalid usable bytes with ctlr %s (index=%d), "+
+						"want=%d (%d clusters) got=%d (%d clusters)",
+						ctlr.GetPciAddr(), idx,
+						tc.output.usableBytes[idx], tc.output.usableBytes[idx]/clusterSize,
+						dev.GetUsableBytes(), dev.GetUsableBytes()/clusterSize))
+			}
+			if tc.output.message != "" {
+				test.AssertTrue(t,
+					strings.Contains(buf.String(), tc.output.message),
+					"missing message: "+tc.output.message)
+			}
+		})
+	}
+}

 func TestServer_getRdbSize(t *testing.T) {
 	type ExpectedOutput struct {
diff --git a/src/control/server/ctl_svc_test.go b/src/control/server/ctl_svc_test.go
index 11995e06671..cb87c590788 100644
--- a/src/control/server/ctl_svc_test.go
+++ b/src/control/server/ctl_svc_test.go
@@ -22,27 +22,20 @@ import (
 	"github.com/daos-stack/daos/src/control/server/storage/scm"
 )

-// mockControlService takes cfgs for tuneable scm and sys provider behavior but
-// default nvmeStorage behavior (cs.nvoe can be subsequently replaced in test).
-func mockControlService(t *testing.T, log logging.Logger, cfg *config.Server, bmbc *bdev.MockBackendConfig, smbc *scm.MockBackendConfig, smsc *system.MockSysConfig, notStarted ...bool) *ControlService {
+func newMockControlServiceFromBackends(t *testing.T, log logging.Logger, cfg *config.Server, bmb *bdev.MockBackend, smb *scm.MockBackend, smsc *system.MockSysConfig, notStarted ...bool) *ControlService {
 	t.Helper()

-	started := true
-	if len(notStarted) > 0 && notStarted[0] {
-		started = false
-	}
-
 	if cfg == nil {
 		cfg = config.DefaultServer().WithEngines(engine.MockConfig().WithTargetCount(1))
 	}

-	// share sys provider between engines to be able to access to same mock config data
-	sysProv := system.NewMockSysProvider(log, smsc)
-	mounter := mount.NewProvider(log, sysProv)
-	scmProv := scm.NewProvider(log, scm.NewMockBackend(smbc), sysProv, mounter)
-	bdevProv := bdev.NewMockProvider(log, bmbc)
+	// Share sys provider between engines to be able to access the same mock config data.
+	bp := bdev.NewProvider(log, bmb)
+	syp := system.NewMockSysProvider(log, smsc)
+	mp := mount.NewProvider(log, syp)
+	sp := scm.NewProvider(log, smb, syp, mp)

-	mscs := NewMockStorageControlService(log, cfg.Engines, sysProv, scmProv, bdevProv, nil)
+	mscs := NewMockStorageControlService(log, cfg.Engines, syp, sp, bp, nil)

 	cs := &ControlService{
 		StorageControlService: *mscs,
@@ -51,20 +44,40 @@ func mockControlService(t *testing.T, log logging.Logger, cfg *config.Server, bm
 		srvCfg: cfg,
 	}

+	started := make([]bool, len(cfg.Engines))
+	for idx := range started {
+		started[idx] = true
+	}
+	switch len(notStarted) {
+	case 0: // Not specified so start all engines.
+	case 1:
+		if notStarted[0] {
+			// If single true notStarted bool, don't start any engines.
+ for idx := range started { + started[idx] = false + } + } + case len(cfg.Engines): // One notStarted bool specified for each engine. + for idx := range started { + started[idx] = !notStarted[idx] + } + default: + t.Fatal("len notStarted != len cfg.Engines") + } + for idx, ec := range cfg.Engines { trc := new(engine.TestRunnerConfig) - if started { + if started[idx] { trc.Running.SetTrue() } runner := engine.NewTestRunner(trc, ec) - storProv := storage.MockProvider(log, 0, &ec.Storage, sysProv, scmProv, bdevProv, - nil) + storProv := storage.MockProvider(log, 0, &ec.Storage, syp, sp, bp, nil) ei := NewEngineInstance(log, storProv, nil, runner) ei.setSuperblock(&Superblock{ Rank: ranklist.NewRankPtr(uint32(idx)), }) - if started { + if started[idx] { ei.ready.SetTrue() } if err := cs.harness.AddInstance(ei); err != nil { @@ -74,3 +87,14 @@ func mockControlService(t *testing.T, log logging.Logger, cfg *config.Server, bm return cs } + +// mockControlService takes cfgs for tuneable scm and sys provider behavior but +// default nvmeStorage behavior. +func mockControlService(t *testing.T, log logging.Logger, cfg *config.Server, bmbc *bdev.MockBackendConfig, smbc *scm.MockBackendConfig, smsc *system.MockSysConfig, notStarted ...bool) *ControlService { + t.Helper() + + bmb := bdev.NewMockBackend(bmbc) + smb := scm.NewMockBackend(smbc) + + return newMockControlServiceFromBackends(t, log, cfg, bmb, smb, smsc, notStarted...) +} diff --git a/src/control/server/harness.go b/src/control/server/harness.go index be53fdd262e..ae90d3ed711 100644 --- a/src/control/server/harness.go +++ b/src/control/server/harness.go @@ -15,7 +15,6 @@ import ( "github.com/pkg/errors" "google.golang.org/protobuf/proto" - commonpb "github.com/daos-stack/daos/src/control/common/proto" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" srvpb "github.com/daos-stack/daos/src/control/common/proto/srv" "github.com/daos-stack/daos/src/control/drpc" @@ -37,7 +36,6 @@ type Engine interface { newCret(string, error) *ctlpb.NvmeControllerResult tryDrpc(context.Context, drpc.Method) *system.MemberResult requestStart(context.Context) - updateInUseBdevs(context.Context, []storage.NvmeController, uint64, uint64) ([]storage.NvmeController, error) isAwaitingFormat() bool // These methods should probably be replaced by callbacks. @@ -46,11 +44,7 @@ type Engine interface { // These methods should probably be refactored out into functions that // accept the engine instance as a parameter. - GetBioHealth(context.Context, *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) - ScanBdevTiers() ([]storage.BdevTierScanResult, error) - ListSmdDevices(context.Context, *ctlpb.SmdDevReq) (*ctlpb.SmdDevResp, error) StorageFormatSCM(context.Context, bool) *ctlpb.ScmMountResult - StorageFormatNVMe() commonpb.NvmeControllerResults // This is a more reasonable surface that will be easier to maintain and test. CallDrpc(context.Context, drpc.Method, proto.Message) (*drpc.Response, error) @@ -67,6 +61,7 @@ type Engine interface { OnInstanceExit(...onInstanceExitFn) OnReady(...onReadyFn) GetStorage() *storage.Provider + Debugf(format string, args ...interface{}) } // EngineHarness is responsible for managing Engine instances. 
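The Engine interface above sheds per-instance methods in favor of package-level helpers (getBioHealth, listSmdDevices) that take the interface as a parameter; later in this patch those helpers are bound to the variables scanSmd and getCtrlrHealth so unit tests can swap in stubs without mocking the whole engine. A minimal, runnable sketch of that seam pattern (all names here are illustrative, not part of the patch):

    package main

    import (
    	"context"
    	"fmt"
    )

    // realHealthQuery stands in for a dRPC-backed implementation.
    func realHealthQuery(ctx context.Context, dev string) (string, error) {
    	return "", fmt.Errorf("no dRPC endpoint for %s", dev)
    }

    // healthQuery is the seam: production code calls through the variable,
    // tests reassign it.
    var healthQuery = realHealthQuery

    func deviceState(ctx context.Context, dev string) string {
    	h, err := healthQuery(ctx, dev)
    	if err != nil {
    		return "UNKNOWN"
    	}
    	return h
    }

    func main() {
    	// What a test would do: install a stub and restore the original on exit.
    	orig := healthQuery
    	healthQuery = func(context.Context, string) (string, error) { return "NORMAL", nil }
    	defer func() { healthQuery = orig }()

    	fmt.Println(deviceState(context.Background(), "0000:01:00.0")) // NORMAL
    }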
diff --git a/src/control/server/instance.go b/src/control/server/instance.go index 1d7f78743ee..b32831f03b9 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -362,3 +362,7 @@ func (ei *EngineInstance) callSetUp(ctx context.Context) error { return nil } + +func (ei *EngineInstance) Debugf(format string, args ...interface{}) { + ei.log.Debugf(format, args...) +} diff --git a/src/control/server/instance_drpc.go b/src/control/server/instance_drpc.go index 76d56678092..d516ba7e636 100644 --- a/src/control/server/instance_drpc.go +++ b/src/control/server/instance_drpc.go @@ -11,19 +11,16 @@ import ( "fmt" "time" - "github.com/dustin/go-humanize" "github.com/pkg/errors" "google.golang.org/protobuf/proto" "github.com/daos-stack/daos/src/control/build" - "github.com/daos-stack/daos/src/control/common/proto/convert" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" srvpb "github.com/daos-stack/daos/src/control/common/proto/srv" "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/ranklist" - "github.com/daos-stack/daos/src/control/server/storage" "github.com/daos-stack/daos/src/control/system" ) @@ -175,8 +172,8 @@ func (ei *EngineInstance) tryDrpc(ctx context.Context, method drpc.Method) *syst } } -func (ei *EngineInstance) GetBioHealth(ctx context.Context, req *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { - dresp, err := ei.CallDrpc(ctx, drpc.MethodBioHealth, req) +func getBioHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + dresp, err := engine.CallDrpc(ctx, drpc.MethodBioHealth, req) if err != nil { return nil, errors.Wrap(err, "GetBioHealth dRPC call") } @@ -193,8 +190,8 @@ func (ei *EngineInstance) GetBioHealth(ctx context.Context, req *ctlpb.BioHealth return resp, nil } -func (ei *EngineInstance) ListSmdDevices(ctx context.Context, req *ctlpb.SmdDevReq) (*ctlpb.SmdDevResp, error) { - dresp, err := ei.CallDrpc(ctx, drpc.MethodSmdDevs, req) +func listSmdDevices(ctx context.Context, engine Engine, req *ctlpb.SmdDevReq) (*ctlpb.SmdDevResp, error) { + dresp, err := engine.CallDrpc(ctx, drpc.MethodSmdDevs, req) if err != nil { return nil, err } @@ -210,105 +207,3 @@ func (ei *EngineInstance) ListSmdDevices(ctx context.Context, req *ctlpb.SmdDevR return resp, nil } - -func (ei *EngineInstance) getSmdDetails(smd *ctlpb.SmdDevice) (*storage.SmdDevice, error) { - smdDev := new(storage.SmdDevice) - if err := convert.Types(smd, smdDev); err != nil { - return nil, errors.Wrap(err, "convert smd") - } - - engineRank, err := ei.GetRank() - if err != nil { - return nil, errors.Wrapf(err, "get rank") - } - - smdDev.Rank = engineRank - - return smdDev, nil -} - -// updateInUseBdevs updates-in-place the input list of controllers with new NVMe health stats and -// SMD metadata info. -// -// Query each SmdDevice on each I/O Engine instance for health stats and update existing controller -// data in ctrlrMap using PCI address key. 
-func (ei *EngineInstance) updateInUseBdevs(ctx context.Context, ctrlrs []storage.NvmeController, ms uint64, rs uint64) ([]storage.NvmeController, error) { - ctrlrMap := make(map[string]*storage.NvmeController) - for idx, ctrlr := range ctrlrs { - if _, exists := ctrlrMap[ctrlr.PciAddr]; exists { - return nil, errors.Errorf("duplicate entries for controller %s", - ctrlr.PciAddr) - } - - // Clear SMD info for controllers to remove stale stats. - ctrlrs[idx].SmdDevices = []*storage.SmdDevice{} - // Update controllers in input slice through map by reference. - ctrlrMap[ctrlr.PciAddr] = &ctrlrs[idx] - } - - smdDevs, err := ei.ListSmdDevices(ctx, new(ctlpb.SmdDevReq)) - if err != nil { - return nil, errors.Wrapf(err, "list smd devices") - } - ei.log.Debugf("engine %d: smdDevs %+v", ei.Index(), smdDevs) - - hasUpdatedHealth := make(map[string]bool) - for _, smd := range smdDevs.Devices { - msg := fmt.Sprintf("instance %d: smd %s: ctrlr %s", ei.Index(), smd.Uuid, - smd.Ctrlr.PciAddr) - - ctrlr, exists := ctrlrMap[smd.Ctrlr.PciAddr] - if !exists { - ei.log.Errorf("%s: ctrlr not found", msg) - continue - } - - smdDev, err := ei.getSmdDetails(smd) - if err != nil { - return nil, errors.Wrapf(err, "%s: collect smd info", msg) - } - smdDev.MetaSize = ms - smdDev.RdbSize = rs - - pbStats, err := ei.GetBioHealth(ctx, &ctlpb.BioHealthReq{DevUuid: smdDev.UUID, MetaSize: ms, RdbSize: rs}) - if err != nil { - // Log the error if it indicates non-existent health and the SMD entity has - // an abnormal state. Otherwise it is expected that health may be missing. - status, ok := errors.Cause(err).(daos.Status) - if ok && status == daos.Nonexistent && smdDev.Ctrlr.NvmeState != storage.NvmeStateNormal { - ei.log.Debugf("%s: stats not found (device state: %q), skip update", - msg, smdDev.Ctrlr.NvmeState.String()) - } else { - ei.log.Errorf("%s: fetch stats: %s", msg, err.Error()) - } - ctrlr.UpdateSmd(smdDev) - continue - } - - // Populate space usage for each SMD device from health stats. - smdDev.TotalBytes = pbStats.TotalBytes - smdDev.AvailBytes = pbStats.AvailBytes - smdDev.ClusterSize = pbStats.ClusterSize - smdDev.MetaWalSize = pbStats.MetaWalSize - smdDev.RdbWalSize = pbStats.RdbWalSize - msg = fmt.Sprintf("%s: smd usage = %s/%s", msg, humanize.Bytes(smdDev.AvailBytes), - humanize.Bytes(smdDev.TotalBytes)) - ctrlr.UpdateSmd(smdDev) - - // Multiple SMD entries for the same address key may exist when there are multiple - // NVMe namespaces (and resident blobstores) exist on a single controller. In this - // case only update once as health stats will be the same for each. - if hasUpdatedHealth[ctrlr.PciAddr] { - continue - } - ctrlr.HealthStats = new(storage.NvmeHealth) - if err := convert.Types(pbStats, ctrlr.HealthStats); err != nil { - ei.log.Errorf("%s: update ctrlr health: %s", msg, err.Error()) - continue - } - ei.log.Debugf("%s: ctrlr health updated", msg) - hasUpdatedHealth[ctrlr.PciAddr] = true - } - - return ctrlrs, nil -} diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index 2cc4f1f5443..7b2b38cc57a 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -168,16 +168,3 @@ func (ei *EngineInstance) logScmStorage() error { return nil } - -// ScanBdevTiers calls in to the private engine storage provider to scan bdev -// tiers. Scan will avoid using any cached results if direct is set to true. 
-func (ei *EngineInstance) ScanBdevTiers() ([]storage.BdevTierScanResult, error) { - isUp := ei.IsReady() - upDn := "down" - if isUp { - upDn = "up" - } - ei.log.Debugf("scanning engine-%d bdev tiers while engine is %s", ei.Index(), upDn) - - return ei.storage.ScanBdevTiers(!isUp) -} diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index 35821e5e378..9ce68660f25 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -21,6 +21,12 @@ import ( "github.com/daos-stack/daos/src/control/server/storage" ) +var ( + scanSmd = listSmdDevices + getCtrlrHealth = getBioHealth + errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance") +) + // newMntRet creates and populates SCM mount result. // Currently only used for format operations. func (ei *EngineInstance) newMntRet(mountPoint string, inErr error) *ctlpb.ScmMountResult { @@ -73,11 +79,26 @@ func (ei *EngineInstance) scmFormat(force bool) (*ctlpb.ScmMountResult, error) { return ei.newMntRet(cfg.Scm.MountPoint, nil), nil } -func (ei *EngineInstance) bdevFormat() (results proto.NvmeControllerResults) { +func formatEngineBdevs(ei *EngineInstance, ctrlrs storage.NvmeControllers) (results proto.NvmeControllerResults) { + // If no superblock exists, format NVMe and populate response with results. + needsSuperblock, err := ei.NeedsSuperblock() + if err != nil { + ei.log.Errorf("engine storage for %s instance %d: NeedsSuperblock(): %s", + build.DataPlaneName, ei.Index(), err) + + return proto.NvmeControllerResults{ + ei.newCret("", err), + } + } + + if !needsSuperblock { + return + } + defer ei.logDuration(track(fmt.Sprintf( "Format of NVMe storage for %s instance %d", build.DataPlaneName, ei.Index()))) - for _, tr := range ei.storage.FormatBdevTiers() { + for _, tr := range ei.storage.FormatBdevTiers(ctrlrs) { if tr.Error != nil { results = append(results, ei.newCret(fmt.Sprintf("tier %d", tr.Tier), tr.Error)) continue @@ -137,58 +158,174 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe return } -// StorageFormatNVMe performs format on NVMe if superblock needs writing. -func (ei *EngineInstance) StorageFormatNVMe() (cResults proto.NvmeControllerResults) { - // If no superblock exists, format NVMe and populate response with results. - needsSuperblock, err := ei.NeedsSuperblock() +func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController) (bool, error) { + state := ctrlr.DevState + if state != ctlpb.NvmeDevState_NORMAL && state != ctlpb.NvmeDevState_EVICTED { + engine.Debugf("skip fetching health stats on device %q in %q state", + ctrlr.PciAddr, ctlpb.NvmeDevState_name[int32(state)]) + return false, nil + } + + health, err := getCtrlrHealth(ctx, engine, req) if err != nil { - ei.log.Errorf("engine storage for %s instance %d: NeedsSuperblock(): %s", - build.DataPlaneName, ei.Index(), err) + return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, + state) + } + ctrlr.HealthStats = health - return proto.NvmeControllerResults{ - ei.newCret("", err), + return true, nil +} + +// Scan SMD devices over dRPC and reconstruct NVMe scan response from results. 
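+// Note that several SMD devices may reference a single controller (one
+// blobstore per NVMe namespace), so controllers are de-duplicated by PCI
+// address below and each appears once in the response with its SMD devices
+// re-attached.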
+func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + scanSmdResp, err := scanSmd(ctx, engine, &ctlpb.SmdDevReq{}) + if err != nil { + return nil, errors.Wrap(err, "scan smd") + } + if scanSmdResp == nil { + return nil, errors.New("nil smd scan resp") + } + + // Re-link SMD devices inside NVMe controller structures and populate scan response. + + pbResp := ctlpb.ScanNvmeResp{ + State: new(ctlpb.ResponseState), + } + seenCtrlrs := make(map[string]*ctlpb.NvmeController) + + for _, sd := range scanSmdResp.Devices { + if sd.Ctrlr == nil { + return nil, errors.Errorf("smd %q has no ctrlr ref", sd.Uuid) } + + addr := sd.Ctrlr.PciAddr + + if _, exists := seenCtrlrs[addr]; !exists { + c := new(ctlpb.NvmeController) + *c = *sd.Ctrlr + c.SmdDevices = nil + c.HealthStats = nil + seenCtrlrs[addr] = c + pbResp.Ctrlrs = append(pbResp.Ctrlrs, c) + } + + c := seenCtrlrs[addr] + + // Populate health if requested. + healthUpdated := false + if pbReq.Health { + bhReq := &ctlpb.BioHealthReq{ + DevUuid: sd.Uuid, + MetaSize: pbReq.MetaSize, + RdbSize: pbReq.RdbSize, + } + upd, err := populateCtrlrHealth(ctx, engine, bhReq, c) + if err != nil { + return nil, err + } + healthUpdated = upd + } + + // Populate SMD (meta) if requested. + if pbReq.Meta { + nsd := new(ctlpb.SmdDevice) + *nsd = *sd + nsd.Ctrlr = nil + nsd.MetaSize = pbReq.MetaSize + nsd.RdbSize = pbReq.RdbSize + if healthUpdated { + // Populate space usage for each SMD device from health stats. + nsd.TotalBytes = c.HealthStats.TotalBytes + nsd.AvailBytes = c.HealthStats.AvailBytes + nsd.ClusterSize = c.HealthStats.ClusterSize + nsd.MetaWalSize = c.HealthStats.MetaWalSize + nsd.RdbWalSize = c.HealthStats.RdbWalSize + } + engineRank, err := engine.GetRank() + if err != nil { + return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) + } + nsd.Rank = engineRank.Uint32() + c.SmdDevices = append(c.SmdDevices, nsd) + } + } + + return &pbResp, nil +} + +func bdevScanEngineAssigned(ctx context.Context, engine Engine, pbReq *ctlpb.ScanNvmeReq, devList *storage.BdevDeviceList, isStarted *bool) (*ctlpb.ScanNvmeResp, error) { + *isStarted = engine.IsStarted() + if !*isStarted { + engine.Debugf("scanning engine-%d bdev tiers while engine is down", engine.Index()) + + // Retrieve engine cfg bdevs to restrict scan scope. + req := storage.BdevScanRequest{DeviceList: devList} + + return bdevScanToProtoResp(engine.GetStorage().ScanBdevs, req) } - if needsSuperblock { - cResults = ei.bdevFormat() + engine.Debugf("scanning engine-%d bdev tiers while engine is up", engine.Index()) + + // If engine is started but not ready, wait for ready state. If partial number of engines + // return results, indicate errors for non-ready engines whilst returning successful scan + // results. 
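+	// NB: pollInstanceState is assumed to return once the predicate holds for
+	// all supplied engines, or with an error if ctx expires first.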
+	pollFn := func(e Engine) bool { return e.IsReady() }
+	if err := pollInstanceState(ctx, []Engine{engine}, pollFn); err != nil {
+		return nil, errors.Wrapf(err, "waiting for engine %d to be ready to receive drpcs",
+			engine.Index())
 	}

-	return
+	return scanEngineBdevsOverDrpc(ctx, engine, pbReq)
 }

-func smdGetHealth(ctx context.Context, ei *EngineInstance, dev *ctlpb.SmdDevice) error {
-	state := dev.Ctrlr.DevState
-	if state != ctlpb.NvmeDevState_NORMAL && state != ctlpb.NvmeDevState_EVICTED {
-		ei.log.Debugf("skip fetching health stats on device %q in %q state", dev,
-			ctlpb.NvmeDevState_name[int32(state)])
-		return nil
+// bdevScanEngine calls into the private engine storage provider to scan bdevs if the engine
+// process is not started; otherwise dRPC is used to retrieve details from the online engine.
+func bdevScanEngine(ctx context.Context, engine Engine, req *ctlpb.ScanNvmeReq) (resp *ctlpb.ScanNvmeResp, err error) {
+	if req == nil {
+		return nil, errors.New("nil request")
 	}

-	health, err := ei.GetBioHealth(ctx, &ctlpb.BioHealthReq{DevUuid: dev.Uuid})
+	eCfgBdevs := storage.TierConfigs(engine.GetStorage().GetBdevConfigs()).Bdevs()
+	if eCfgBdevs.Len() == 0 {
+		return nil, errEngineBdevScanEmptyDevList
+	}
+
+	var isStarted bool
+	resp, err = bdevScanEngineAssigned(ctx, engine, req, eCfgBdevs, &isStarted)
 	if err != nil {
-		return errors.Wrapf(err, "device %q, state %q", dev, state)
+		return nil, err
 	}
-	dev.Ctrlr.HealthStats = health
-	return nil
-}
+	// Retry once if engine provider scan returns unexpected number of controllers in case
+	// engines claimed devices between when started state was checked and scan was executed.
+	if !isStarted && len(resp.Ctrlrs) != eCfgBdevs.Len() {
+		engine.Debugf("retrying engine bdev scan as unexpected nr returned, want %d got %d",
+			eCfgBdevs.Len(), len(resp.Ctrlrs))

-func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq) (*ctlpb.SmdQueryResp_RankResp, error) {
-	ei, ok := engine.(*EngineInstance)
-	if !ok {
-		return nil, errors.New("not EngineInstance")
+		resp, err = bdevScanEngineAssigned(ctx, engine, req, eCfgBdevs, &isStarted)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if len(resp.Ctrlrs) != eCfgBdevs.Len() {
+		engine.Debugf("engine bdev scan returned unexpected nr, want %d got %d",
+			eCfgBdevs.Len(), len(resp.Ctrlrs))
 	}

-	engineRank, err := ei.GetRank()
+	return
+}
+
+func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq) (*ctlpb.SmdQueryResp_RankResp, error) {
+	engineRank, err := engine.GetRank()
 	if err != nil {
-		return nil, errors.Wrapf(err, "instance %d GetRank", ei.Index())
+		return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index())
 	}

 	rResp := new(ctlpb.SmdQueryResp_RankResp)
 	rResp.Rank = engineRank.Uint32()

-	listDevsResp, err := ei.ListSmdDevices(ctx, new(ctlpb.SmdDevReq))
+	listDevsResp, err := listSmdDevices(ctx, engine, new(ctlpb.SmdDevReq))
 	if err != nil {
 		return nil, errors.Wrapf(err, "rank %d", engineRank)
 	}
@@ -198,7 +335,6 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
 		return rResp, nil
 	}

-	// For each SmdDevice returned in list devs response, append a SmdDeviceWithHealth.
 	for _, sd := range listDevsResp.Devices {
 		if sd != nil {
 			rResp.Devices = append(rResp.Devices, sd)
@@ -211,7 +347,8 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
 			continue // Skip health query if UUID doesn't match requested.
} if pbReq.IncludeBioHealth { - if err := smdGetHealth(ctx, ei, dev); err != nil { + bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid} + if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr); err != nil { return nil, err } } diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go new file mode 100644 index 00000000000..dfa0a28bd3b --- /dev/null +++ b/src/control/server/instance_storage_rpc_test.go @@ -0,0 +1,287 @@ +// +// (C) Copyright 2023 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package server + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/proto" + ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" + "github.com/daos-stack/daos/src/control/server/config" + "github.com/daos-stack/daos/src/control/server/engine" + "github.com/daos-stack/daos/src/control/server/storage" + "github.com/daos-stack/daos/src/control/server/storage/bdev" + "github.com/daos-stack/daos/src/control/server/storage/scm" +) + +func TestIOEngineInstance_bdevScanEngine(t *testing.T) { + c := storage.MockNvmeController(2) + defSmdScanRes := func() *ctlpb.SmdDevResp { + return &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{ + proto.MockSmdDevice(c, 2), + }, + } + } + healthRespWithUsage := func() *ctlpb.BioHealthResp { + mh := proto.MockNvmeHealth(2) + mh.TotalBytes, mh.AvailBytes, mh.ClusterSize = 1, 2, 3 + mh.MetaWalSize, mh.RdbWalSize = 4, 5 + return mh + } + + for name, tc := range map[string]struct { + req ctlpb.ScanNvmeReq + bdevAddrs []string + provRes *storage.BdevScanResponse + provErr error + engStopped bool + smdRes *ctlpb.SmdDevResp + smdErr error + healthRes *ctlpb.BioHealthResp + healthErr error + expResp *ctlpb.ScanNvmeResp + expErr error + expBackendScanCalls []storage.BdevScanRequest + }{ + "no bdevs in cfg": { + bdevAddrs: []string{}, + expErr: errors.New("empty device list"), + }, + "engines stopped; scan over engine provider": { + bdevAddrs: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, + engStopped: true, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + storage.MockNvmeController(1), + storage.MockNvmeController(2), + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "engines stopped; scan over engine provider; retry on empty response": { + bdevAddrs: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, + engStopped: true, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(1), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "engines stopped; scan fails over engine provider": { + engStopped: true, + provErr: errors.New("provider scan fail"), + expErr: errors.New("provider scan fail"), + 
}, + "scan over drpc; no health or meta": { + smdRes: defSmdScanRes(), + healthRes: proto.MockNvmeHealth(2), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = nil + c.SmdDevices = nil + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan fails over drpc": { + smdErr: errors.New("drpc fail"), + expErr: errors.New("drpc fail"), + }, + "scan over drpc; with health": { + req: ctlpb.ScanNvmeReq{Health: true}, + smdRes: defSmdScanRes(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + c.SmdDevices = nil + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with smd": { + req: ctlpb.ScanNvmeReq{Meta: true}, + smdRes: defSmdScanRes(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = nil + c.SmdDevices = []*ctlpb.SmdDevice{ + proto.MockSmdDevice(nil, 2), + } + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with smd and health; usage and wal size reported": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + smdRes: defSmdScanRes(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + sd := proto.MockSmdDevice(nil, 2) + sd.TotalBytes = c.HealthStats.TotalBytes + sd.AvailBytes = c.HealthStats.AvailBytes + sd.ClusterSize = c.HealthStats.ClusterSize + sd.MetaWalSize = c.HealthStats.MetaWalSize + sd.RdbWalSize = c.HealthStats.RdbWalSize + c.SmdDevices = []*ctlpb.SmdDevice{sd} + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with smd and health; missing ctrlr in smd": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr = nil + return ssr + }(), + healthRes: healthRespWithUsage(), + expErr: errors.New("no ctrlr ref"), + }, + "scan over drpc; with smd and health; health scan fails": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + smdRes: defSmdScanRes(), + healthErr: errors.New("health scan failed"), + expErr: errors.New("health scan failed"), + }, + "scan over drpc; with smd and health; smd list fails": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + smdErr: errors.New("smd scan failed"), + healthRes: healthRespWithUsage(), + expErr: errors.New("smd scan failed"), + }, + "scan over drpc; with smd and health; nil smd list returned": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + healthRes: healthRespWithUsage(), + expErr: errors.New("nil smd scan resp"), + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + scanSmd = func(_ context.Context, _ Engine, _ *ctlpb.SmdDevReq) (*ctlpb.SmdDevResp, error) { + return tc.smdRes, tc.smdErr + } + defer func() { + scanSmd = listSmdDevices + }() + getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + return tc.healthRes, tc.healthErr + } + defer func() { + getCtrlrHealth = getBioHealth + }() + + if tc.provRes == nil { + tc.provRes = 
defProviderScanRes + } + + ec := engine.MockConfig() + if tc.bdevAddrs == nil { + tc.bdevAddrs = []string{test.MockPCIAddr(1)} + } + ec.WithStorage(storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(tc.bdevAddrs...)) + + sCfg := config.DefaultServer().WithEngines(ec) + + bmbc := &bdev.MockBackendConfig{ + ScanRes: tc.provRes, + ScanErr: tc.provErr, + } + bmb := bdev.NewMockBackend(bmbc) + smb := scm.NewMockBackend(nil) + + cs := newMockControlServiceFromBackends(t, log, sCfg, bmb, smb, nil, + tc.engStopped) + ei := cs.harness.Instances()[0].(*EngineInstance) + + resp, err := bdevScanEngine(test.Context(t), ei, &tc.req) + test.CmpErr(t, tc.expErr, err) + if err != nil { + return + } + + if diff := cmp.Diff(tc.expResp, resp, + defStorageScanCmpOpts...); diff != "" { + t.Fatalf("unexpected response (-want, +got):\n%s\n", diff) + } + + cmpopt := cmp.Comparer(func(x, y *storage.BdevDeviceList) bool { + if x == nil && y == nil { + return true + } + return x.Equals(y) + }) + + bmb.RLock() + if len(tc.expBackendScanCalls) != len(bmb.ScanCalls) { + t.Fatalf("unexpected number of backend scan calls, want %d got %d", + len(tc.expBackendScanCalls), len(bmb.ScanCalls)) + } + if len(tc.expBackendScanCalls) == 0 { + return + } + if diff := cmp.Diff(tc.expBackendScanCalls, bmb.ScanCalls, + append(defStorageScanCmpOpts, cmpopt)...); diff != "" { + t.Fatalf("unexpected backend scan calls (-want, +got):\n%s\n", diff) + } + bmb.RUnlock() + }) + } +} diff --git a/src/control/server/instance_test.go b/src/control/server/instance_test.go index 70511daa3ec..ea088285467 100644 --- a/src/control/server/instance_test.go +++ b/src/control/server/instance_test.go @@ -280,3 +280,7 @@ func (mi *MockInstance) StorageFormatSCM(context.Context, bool) *ctlpb.ScmMountR func (mi *MockInstance) GetStorage() *storage.Provider { return nil } + +func (mi *MockInstance) Debugf(format string, args ...interface{}) { + return +} diff --git a/src/control/server/server.go b/src/control/server/server.go index c8812e3b987..4b62244e818 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -283,8 +283,12 @@ func (srv *server) createEngine(ctx context.Context, idx int, cfg *engine.Config return control.SystemJoin(ctxIn, srv.mgmtSvc.rpcClient, req) } - engine := NewEngineInstance(srv.log, storage.DefaultProvider(srv.log, idx, &cfg.Storage), joinFn, - engine.NewRunner(srv.log, cfg)).WithHostFaultDomain(srv.harness.faultDomain) + sp := storage.DefaultProvider(srv.log, idx, &cfg.Storage). + WithVMDEnabled(srv.ctlSvc.storage.IsVMDEnabled()) + + engine := NewEngineInstance(srv.log, sp, joinFn, engine.NewRunner(srv.log, cfg)). + WithHostFaultDomain(srv.harness.faultDomain) + if idx == 0 { configureFirstEngine(ctx, engine, srv.sysdb, joinFn) } @@ -308,29 +312,16 @@ func (srv *server) addEngines(ctx context.Context) error { return err } - // Retrieve NVMe device details (before engines are started) so static details can be - // recovered by the engine storage provider(s) during scan even if devices are in use. 
- nvmeScanResp, err := scanBdevStorage(srv) - if err != nil { - return err - } - if len(srv.cfg.Engines) == 0 { return nil } - nrEngineBdevsIdx := -1 - nrEngineBdevs := -1 for i, c := range srv.cfg.Engines { engine, err := srv.createEngine(ctx, i, c) if err != nil { return errors.Wrap(err, "creating engine instances") } - if err := setEngineBdevs(engine, nvmeScanResp, &nrEngineBdevsIdx, &nrEngineBdevs); err != nil { - return errors.Wrap(err, "setting engine bdevs") - } - registerEngineEventCallbacks(srv, engine, &allStarted) if err := srv.harness.AddInstance(engine); err != nil { diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go index 6e059aef32e..596bb0c50a9 100644 --- a/src/control/server/server_utils.go +++ b/src/control/server/server_utils.go @@ -355,77 +355,6 @@ func prepBdevStorage(srv *server, iommuEnabled bool) error { return nil } -// scanBdevStorage performs discovery and validates existence of configured NVMe SSDs. -func scanBdevStorage(srv *server) (*storage.BdevScanResponse, error) { - defer srv.logDuration(track("time to scan bdev storage")) - - if srv.cfg.DisableHugepages { - srv.log.Debugf("skip nvme scan as hugepages have been disabled in config") - return &storage.BdevScanResponse{}, nil - } - - nvmeScanResp, err := srv.ctlSvc.NvmeScan(storage.BdevScanRequest{ - DeviceList: getBdevCfgsFromSrvCfg(srv.cfg).Bdevs(), - }) - if err != nil { - err = errors.Wrap(err, "NVMe Scan Failed") - srv.log.Errorf("%s", err) - return nil, err - } - - return nvmeScanResp, nil -} - -func setEngineBdevs(engine *EngineInstance, scanResp *storage.BdevScanResponse, lastEngineIdx, lastBdevCount *int) error { - badInput := "" - switch { - case engine == nil: - badInput = "engine" - case scanResp == nil: - badInput = "scanResp" - case lastEngineIdx == nil: - badInput = "lastEngineIdx" - case lastBdevCount == nil: - badInput = "lastBdevCount" - } - if badInput != "" { - return errors.New("nil input param: " + badInput) - } - - if err := engine.storage.SetBdevCache(*scanResp); err != nil { - return errors.Wrap(err, "setting engine storage bdev cache") - } - - // After engine's bdev cache has been set, the cache will only contain details of bdevs - // identified in the relevant engine config and device addresses will have been verified - // against NVMe scan results. As any VMD endpoint addresses will have been replaced with - // backing device addresses, device counts will reflect the number of physical (as opposed - // to logical) bdevs and engine bdev counts can be accurately compared. - - eIdx := engine.Index() - bdevCache := engine.storage.GetBdevCache() - newNrBdevs := len(bdevCache.Controllers) - - // Update last recorded counters if this is the first update or if the number of bdevs is - // unchanged. If bdev count differs between engines, return fault. 
- switch { - case *lastEngineIdx < 0: - if *lastBdevCount >= 0 { - return errors.New("expecting both lastEngineIdx and lastBdevCount to be unset") - } - *lastEngineIdx = int(eIdx) - *lastBdevCount = newNrBdevs - case *lastBdevCount < 0: - return errors.New("expecting both lastEngineIdx and lastBdevCount to be set") - case newNrBdevs == *lastBdevCount: - *lastEngineIdx = int(eIdx) - default: - return config.FaultConfigBdevCountMismatch(int(eIdx), newNrBdevs, *lastEngineIdx, *lastBdevCount) - } - - return nil -} - func setDaosHelperEnvs(cfg *config.Server, setenv func(k, v string) error) error { if cfg.HelperLogFile != "" { if err := setenv(pbin.DaosPrivHelperLogFileEnvVar, cfg.HelperLogFile); err != nil { diff --git a/src/control/server/server_utils_test.go b/src/control/server/server_utils_test.go index 1fb0567fadd..7fd1ac73859 100644 --- a/src/control/server/server_utils_test.go +++ b/src/control/server/server_utils_test.go @@ -842,183 +842,6 @@ func TestServer_checkEngineTmpfsMem(t *testing.T) { } } -// TestServer_scanBdevStorage validates that an error is returned in the case that a SSD is not -// found and doesn't return an error if SPDK fails to init. -func TestServer_scanBdevStorage(t *testing.T) { - for name, tc := range map[string]struct { - disableHugepages bool - bmbc *bdev.MockBackendConfig - expErr error - }{ - "spdk fails init": { - bmbc: &bdev.MockBackendConfig{ - ScanErr: errors.New("spdk failed"), - }, - expErr: errors.New("spdk failed"), - }, - "bdev in config not found by spdk": { - bmbc: &bdev.MockBackendConfig{ - ScanErr: storage.FaultBdevNotFound(test.MockPCIAddr()), - }, - expErr: storage.FaultBdevNotFound(test.MockPCIAddr()), - }, - "successful scan": { - bmbc: &bdev.MockBackendConfig{ - ScanRes: &storage.BdevScanResponse{ - Controllers: storage.MockNvmeControllers(1), - }, - }, - }, - "hugepages disabled": { - disableHugepages: true, - bmbc: &bdev.MockBackendConfig{ - ScanErr: errors.New("spdk failed"), - }, - }, - } { - t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(name) - defer test.ShowBufferOnFailure(t, buf) - - cfg := config.DefaultServer().WithFabricProvider("ofi+verbs"). 
- WithDisableHugepages(tc.disableHugepages) - - if err := cfg.Validate(log); err != nil { - t.Fatal(err) - } - - srv, err := newServer(log, cfg, &system.FaultDomain{}) - if err != nil { - t.Fatal(err) - } - - mbb := bdev.NewMockBackend(tc.bmbc) - mbp := bdev.NewProvider(log, mbb) - sp := sysprov.NewMockSysProvider(log, nil) - - srv.ctlSvc = &ControlService{ - StorageControlService: *NewMockStorageControlService(log, cfg.Engines, - sp, - scm.NewProvider(log, scm.NewMockBackend(nil), sp, nil), - mbp, nil), - srvCfg: cfg, - } - - _, gotErr := scanBdevStorage(srv) - test.CmpErr(t, tc.expErr, gotErr) - }) - } -} - -func TestServer_setEngineBdevs(t *testing.T) { - for name, tc := range map[string]struct { - cfg engine.Config - engineIdx uint32 - scanResp *storage.BdevScanResponse - lastEngineIdx int - lastBdevCount int - expErr error - expLastEngineIdx int - expLastBdevCount int - }{ - "nil input": { - expErr: errors.New("nil input param: scanResp"), - }, - "empty cache": { - scanResp: &storage.BdevScanResponse{}, - lastEngineIdx: -1, - lastBdevCount: -1, - }, - "index unset; bdev count set": { - scanResp: &storage.BdevScanResponse{}, - lastEngineIdx: -1, - lastBdevCount: 0, - expErr: errors.New("to be unset"), - }, - "index set; bdev count unset": { - scanResp: &storage.BdevScanResponse{}, - lastEngineIdx: 0, - lastBdevCount: -1, - expErr: errors.New("to be set"), - }, - "empty cache; counts match": { - engineIdx: 1, - scanResp: &storage.BdevScanResponse{}, - lastEngineIdx: 0, - lastBdevCount: 0, - expLastEngineIdx: 1, - }, - "empty cache; count mismatch": { - engineIdx: 1, - scanResp: &storage.BdevScanResponse{}, - lastEngineIdx: 0, - lastBdevCount: 1, - expErr: errors.New("engine 1 has 0 but engine 0 has 1"), - }, - "populated cache; cache miss": { - engineIdx: 1, - scanResp: &storage.BdevScanResponse{Controllers: storage.MockNvmeControllers(1)}, - lastEngineIdx: 0, - lastBdevCount: 1, - expErr: errors.New("engine 1 has 0 but engine 0 has 1"), - }, - "populated cache; cache hit": { - cfg: *engine.MockConfig(). - WithStorage( - storage.NewTierConfig(). - WithStorageClass("nvme"). - WithBdevDeviceList("0000:00:00.0"), - ), - engineIdx: 1, - scanResp: &storage.BdevScanResponse{Controllers: storage.MockNvmeControllers(1)}, - lastEngineIdx: 0, - lastBdevCount: 1, - expLastEngineIdx: 1, - expLastBdevCount: 1, - }, - "populated cache; multiple vmd backing devices": { - cfg: *engine.MockConfig(). - WithStorage( - storage.NewTierConfig(). - WithStorageClass("nvme"). 
- WithBdevDeviceList("0000:05:05.5", "0000:5d:05.5"), - ), - engineIdx: 1, - scanResp: &storage.BdevScanResponse{ - Controllers: storage.NvmeControllers{ - &storage.NvmeController{PciAddr: "5d0505:01:00.0"}, - &storage.NvmeController{PciAddr: "5d0505:03:00.0"}, - &storage.NvmeController{PciAddr: "050505:01:00.0"}, - &storage.NvmeController{PciAddr: "050505:02:00.0"}, - }, - }, - lastEngineIdx: 0, - lastBdevCount: 4, - expLastEngineIdx: 1, - expLastBdevCount: 4, - }, - } { - t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(name) - defer test.ShowBufferOnFailure(t, buf) - - engine := NewEngineInstance(log, - storage.DefaultProvider(log, int(tc.engineIdx), &tc.cfg.Storage), - nil, engine.NewRunner(log, &tc.cfg)) - engine.setIndex(tc.engineIdx) - - gotErr := setEngineBdevs(engine, tc.scanResp, &tc.lastEngineIdx, &tc.lastBdevCount) - test.CmpErr(t, tc.expErr, gotErr) - if tc.expErr != nil { - return - } - - test.AssertEqual(t, tc.expLastEngineIdx, tc.lastEngineIdx, "unexpected last engine index") - test.AssertEqual(t, tc.expLastBdevCount, tc.lastBdevCount, "unexpected last bdev count") - }) - } -} - func testFabricProviderSet(prov ...string) *hardware.FabricProviderSet { providers := []*hardware.FabricProvider{} for _, p := range prov { diff --git a/src/control/server/storage/bdev.go b/src/control/server/storage/bdev.go index bdc231a3be8..d69326b4aaa 100644 --- a/src/control/server/storage/bdev.go +++ b/src/control/server/storage/bdev.go @@ -502,9 +502,8 @@ type ( // BdevScanRequest defines the parameters for a Scan operation. BdevScanRequest struct { pbin.ForwardableRequest - DeviceList *BdevDeviceList - VMDEnabled bool - BypassCache bool + DeviceList *BdevDeviceList + VMDEnabled bool } // BdevScanResponse contains information gleaned during a successful Scan operation. @@ -525,12 +524,12 @@ type ( // BdevFormatRequest defines the parameters for a Format operation. BdevFormatRequest struct { pbin.ForwardableRequest - Properties BdevTierProperties - OwnerUID int - OwnerGID int - VMDEnabled bool - Hostname string - BdevCache *BdevScanResponse + Properties BdevTierProperties + OwnerUID int + OwnerGID int + Hostname string + VMDEnabled bool + ScannedBdevs NvmeControllers // VMD needs address mapping for backing devices. } // BdevWriteConfigRequest defines the parameters for a WriteConfig operation. @@ -540,14 +539,14 @@ type ( OwnerUID int OwnerGID int TierProps []BdevTierProperties - VMDEnabled bool HotplugEnabled bool HotplugBusidBegin uint8 HotplugBusidEnd uint8 Hostname string - BdevCache *BdevScanResponse AccelProps AccelProps SpdkRpcSrvProps SpdkRpcServer + VMDEnabled bool + ScannedBdevs NvmeControllers // VMD needs address mapping for backing devices. } // BdevWriteConfigResponse contains the result of a WriteConfig operation. diff --git a/src/control/server/storage/bdev/backend.go b/src/control/server/storage/bdev/backend.go index 464bc680cb9..4836947f70b 100644 --- a/src/control/server/storage/bdev/backend.go +++ b/src/control/server/storage/bdev/backend.go @@ -273,7 +273,7 @@ func (sb *spdkBackend) Prepare(req storage.BdevPrepareRequest) (*storage.BdevPre // groomDiscoveredBdevs ensures that for a non-empty device list, restrict output controller data // to only those devices discovered and in device list and confirm that the devices specified in -// the device list have all been discovered. +// the device list have all been discovered. VMD addresses with no backing devices return error. 
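The grooming contract described in the comment above can be captured in a small, self-contained sketch. The helper below is hypothetical and not part of this change, but it mirrors the documented behaviour: keep only the discovered controllers whose addresses were requested, and report every requested address that was never discovered (including a VMD domain with no backing devices) as not found.

package main

import "fmt"

// ctrlr stands in for storage.NvmeController; only the PCI address matters here.
type ctrlr struct{ pciAddr string }

// groomSketch filters discovered controllers down to the requested address
// list and errors if any requested address was never discovered.
func groomSketch(requested []string, discovered []ctrlr) ([]ctrlr, error) {
	byAddr := make(map[string]ctrlr, len(discovered))
	for _, c := range discovered {
		byAddr[c.pciAddr] = c
	}
	var out []ctrlr
	var missing []string
	for _, addr := range requested {
		if c, ok := byAddr[addr]; ok {
			out = append(out, c)
		} else {
			missing = append(missing, addr)
		}
	}
	if len(missing) > 0 {
		return nil, fmt.Errorf("not found: %v", missing)
	}
	return out, nil
}

func main() {
	out, err := groomSketch([]string{"0000:01:00.0"},
		[]ctrlr{{"0000:01:00.0"}, {"0000:02:00.0"}})
	fmt.Println(out, err) // [{0000:01:00.0}] <nil>
}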
func groomDiscoveredBdevs(reqDevs *hardware.PCIAddressSet, discovered storage.NvmeControllers, vmdEnabled bool) (storage.NvmeControllers, error) { // if the request does not specify a device filter, return all discovered controllers if reqDevs.IsEmpty() { @@ -319,7 +319,7 @@ func groomDiscoveredBdevs(reqDevs *hardware.PCIAddressSet, discovered storage.Nv } if !missing.IsEmpty() { - return nil, storage.FaultBdevNotFound(missing.Strings()...) + return nil, storage.FaultBdevNotFound(vmdEnabled, missing.Strings()...) } return out, nil @@ -466,7 +466,7 @@ func (sb *spdkBackend) formatNvme(req *storage.BdevFormatRequest) (*storage.Bdev if req.VMDEnabled { sb.log.Debug("vmd support enabled during nvme format") - dl, err := substituteVMDAddresses(sb.log, needDevs, req.BdevCache) + dl, err := substituteVMDAddresses(sb.log, needDevs, req.ScannedBdevs) if err != nil { return nil, err } @@ -501,7 +501,6 @@ func (sb *spdkBackend) formatNvme(req *storage.BdevFormatRequest) (*storage.Bdev func (sb *spdkBackend) Format(req storage.BdevFormatRequest) (resp *storage.BdevFormatResponse, err error) { sb.log.Debugf("spdk backend format (bindings call): %+v", req) - // TODO (DAOS-3844): Kick off device formats in parallel? switch req.Properties.Class { case storage.ClassFile: return sb.formatAioFile(&req) @@ -529,7 +528,7 @@ func (sb *spdkBackend) writeNvmeConfig(req storage.BdevWriteConfigRequest, confW bdevs := &props.DeviceList.PCIAddressSet - dl, err := substituteVMDAddresses(sb.log, bdevs, req.BdevCache) + dl, err := substituteVMDAddresses(sb.log, bdevs, req.ScannedBdevs) if err != nil { return errors.Wrapf(err, "storage tier %d", props.Tier) } diff --git a/src/control/server/storage/bdev/backend_test.go b/src/control/server/storage/bdev/backend_test.go index 76780e6ea87..53a3a0f9bf1 100644 --- a/src/control/server/storage/bdev/backend_test.go +++ b/src/control/server/storage/bdev/backend_test.go @@ -125,14 +125,22 @@ func TestBackend_groomDiscoveredBdevs(t *testing.T) { "missing": { reqAddrList: []string{ctrlr1.PciAddr, ctrlr2.PciAddr, ctrlr3.PciAddr}, inCtrlrs: storage.NvmeControllers{ctrlr1, ctrlr3}, - expErr: storage.FaultBdevNotFound(ctrlr2.PciAddr), + expErr: storage.FaultBdevNotFound(false, ctrlr2.PciAddr), }, "vmd devices; vmd not enabled": { reqAddrList: []string{"0000:85:05.5"}, inCtrlrs: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", "850505:14:00.0", "5d0505:03:00.0"), - expErr: storage.FaultBdevNotFound("0000:85:05.5"), + expErr: storage.FaultBdevNotFound(false, "0000:85:05.5"), + }, + "vmd enabled; missing backing devices": { + vmdEnabled: true, + reqAddrList: []string{"0000:85:05.5", "0000:05:05.5"}, + inCtrlrs: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", + "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", + "850505:11:00.0", "850505:14:00.0", "5d0505:03:00.0"), + expErr: storage.FaultBdevNotFound(true, "0000:05:05.5"), }, "vmd devices; vmd enabled": { vmdEnabled: true, @@ -205,7 +213,7 @@ func TestBackend_Scan(t *testing.T) { DiscoverCtrlrs: storage.NvmeControllers{ctrlr1}, }, req: mockScanReq(storage.MockNvmeController(2).PciAddr), - expErr: storage.FaultBdevNotFound(storage.MockNvmeController(2).PciAddr), + expErr: storage.FaultBdevNotFound(false, storage.MockNvmeController(2).PciAddr), }, "emulated nvme; AIO-file": { req: mockScanReq(storage.MockNvmeAioFile(2).Path), @@ -512,10 +520,8 @@ func TestBackend_Format(t *testing.T) { Class: storage.ClassNvme, DeviceList: 
storage.MustNewBdevDeviceList(vmdAddr), }, - VMDEnabled: true, - BdevCache: &storage.BdevScanResponse{ - Controllers: mockCtrlrsInclVMD(), - }, + VMDEnabled: true, + ScannedBdevs: mockCtrlrsInclVMD(), }, expResp: &storage.BdevFormatResponse{ DeviceResponses: map[string]*storage.BdevDeviceFormatResponse{ @@ -649,9 +655,7 @@ func TestBackend_writeNvmeConfig(t *testing.T) { DeviceList: storage.MustNewBdevDeviceList(vmdAddr), }, }, - BdevCache: &storage.BdevScanResponse{ - Controllers: mockCtrlrsInclVMD(), - }, + ScannedBdevs: mockCtrlrsInclVMD(), }, expCall: &storage.BdevWriteConfigRequest{ VMDEnabled: true, @@ -661,9 +665,7 @@ func TestBackend_writeNvmeConfig(t *testing.T) { DeviceList: storage.MustNewBdevDeviceList(vmdBackingAddr1, vmdBackingAddr2), }, }, - BdevCache: &storage.BdevScanResponse{ - Controllers: mockCtrlrsInclVMD(), - }, + ScannedBdevs: mockCtrlrsInclVMD(), }, }, } { diff --git a/src/control/server/storage/bdev/backend_vmd.go b/src/control/server/storage/bdev/backend_vmd.go index 20b5d1b0720..027bf725bb2 100644 --- a/src/control/server/storage/bdev/backend_vmd.go +++ b/src/control/server/storage/bdev/backend_vmd.go @@ -118,19 +118,19 @@ func substVMDAddrs(inPCIAddrs *hardware.PCIAddressSet, foundCtrlrs storage.NvmeC // substituteVMDAddresses wraps around substVMDAddrs to substitute VMD addresses with the relevant // backing device addresses. -// Function takes a BdevScanResponse reference to derive address map and a logger. +// Function takes the list of scanned NVMe controllers to derive the address map, and a logger. -func substituteVMDAddresses(log logging.Logger, inPCIAddrs *hardware.PCIAddressSet, bdevCache *storage.BdevScanResponse) (*hardware.PCIAddressSet, error) { +func substituteVMDAddresses(log logging.Logger, inPCIAddrs *hardware.PCIAddressSet, ctrlrs storage.NvmeControllers) (*hardware.PCIAddressSet, error) { if inPCIAddrs == nil { return nil, errors.New("nil input PCIAddressSet") } - if bdevCache == nil || len(bdevCache.Controllers) == 0 { - log.Debugf("no bdev cache to find vmd backing devices (devs: %v)", inPCIAddrs) + if len(ctrlrs) == 0 { + log.Debugf("no bdev info to find vmd backing devices (devs: %v)", inPCIAddrs) return inPCIAddrs, nil } msg := fmt.Sprintf("vmd detected, processing addresses (input %v, existing %v)", - inPCIAddrs, bdevCache.Controllers) + inPCIAddrs, ctrlrs) - dl, err := substVMDAddrs(inPCIAddrs, bdevCache.Controllers) + dl, err := substVMDAddrs(inPCIAddrs, ctrlrs) if err != nil { return nil, errors.Wrapf(err, msg) } diff --git a/src/control/server/storage/bdev/backend_vmd_test.go b/src/control/server/storage/bdev/backend_vmd_test.go index 7038da2a4e5..ab40829af88 100644 --- a/src/control/server/storage/bdev/backend_vmd_test.go +++ b/src/control/server/storage/bdev/backend_vmd_test.go @@ -28,46 +28,38 @@ const ( func TestBackend_substituteVMDAddresses(t *testing.T) { for name, tc := range map[string]struct { - inAddrs *hardware.PCIAddressSet - bdevCache *storage.BdevScanResponse - expOutAddrs *hardware.PCIAddressSet - expErr error + inAddrs *hardware.PCIAddressSet + scannedBdevs storage.NvmeControllers + expOutAddrs *hardware.PCIAddressSet + expErr error }{ "one vmd requested; no backing devices": { inAddrs: addrListFromStrings(vmdAddr), - bdevCache: &storage.BdevScanResponse{ - Controllers: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", - "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", - "850505:11:00.0", "850505:14:00.0"), - }, + scannedBdevs: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", + "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", + "850505:11:00.0", "850505:14:00.0"), expOutAddrs:
addrListFromStrings(vmdAddr), }, "one vmd requested; two backing devices": { - inAddrs: addrListFromStrings(vmdAddr), - bdevCache: &storage.BdevScanResponse{ - Controllers: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2), - }, - expOutAddrs: addrListFromStrings(vmdBackingAddr1, vmdBackingAddr2), + inAddrs: addrListFromStrings(vmdAddr), + scannedBdevs: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2), + expOutAddrs: addrListFromStrings(vmdBackingAddr1, vmdBackingAddr2), }, "two vmds requested; one has backing devices": { inAddrs: addrListFromStrings(vmdAddr, "0000:85:05.5"), - bdevCache: &storage.BdevScanResponse{ - Controllers: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", - "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", - "850505:11:00.0", "850505:14:00.0"), - }, + scannedBdevs: ctrlrsFromPCIAddrs("850505:07:00.0", "850505:09:00.0", + "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", + "850505:11:00.0", "850505:14:00.0"), expOutAddrs: addrListFromStrings(vmdAddr, "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", "850505:14:00.0"), }, "two vmds requested; both have backing devices": { inAddrs: addrListFromStrings(vmdAddr, "0000:85:05.5"), - bdevCache: &storage.BdevScanResponse{ - Controllers: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2, - "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", - "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", - "850505:14:00.0"), - }, + scannedBdevs: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2, + "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", + "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", + "850505:14:00.0"), expOutAddrs: addrListFromStrings(vmdBackingAddr1, vmdBackingAddr2, "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", @@ -75,12 +67,10 @@ func TestBackend_substituteVMDAddresses(t *testing.T) { }, "input vmd backing devices": { inAddrs: addrListFromStrings(vmdBackingAddr2, vmdBackingAddr1), - bdevCache: &storage.BdevScanResponse{ - Controllers: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2, - "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", - "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", - "850505:14:00.0"), - }, + scannedBdevs: ctrlrsFromPCIAddrs(vmdBackingAddr1, vmdBackingAddr2, + "850505:07:00.0", "850505:09:00.0", "850505:0b:00.0", + "850505:0d:00.0", "850505:0f:00.0", "850505:11:00.0", + "850505:14:00.0"), expOutAddrs: addrListFromStrings(vmdBackingAddr1, vmdBackingAddr2), }, "input vmd backing devices; no cache": { @@ -92,7 +82,7 @@ func TestBackend_substituteVMDAddresses(t *testing.T) { log, buf := logging.NewTestLogger(name) defer test.ShowBufferOnFailure(t, buf) - gotAddrs, gotErr := substituteVMDAddresses(log, tc.inAddrs, tc.bdevCache) + gotAddrs, gotErr := substituteVMDAddresses(log, tc.inAddrs, tc.scannedBdevs) test.CmpErr(t, tc.expErr, gotErr) if gotErr != nil { return diff --git a/src/control/server/storage/bdev/firmware.go b/src/control/server/storage/bdev/firmware.go index 2179c71f875..1d59998e4ab 100644 --- a/src/control/server/storage/bdev/firmware.go +++ b/src/control/server/storage/bdev/firmware.go @@ -91,7 +91,7 @@ func getDeviceController(pciAddr string, controllers storage.NvmeControllers) (* } } - return nil, storage.FaultBdevNotFound(pciAddr) + return nil, storage.FaultBdevNotFound(false, pciAddr) } func filterControllersByModelFirmware(controllers storage.NvmeControllers, modelID, fwRev string) storage.NvmeControllers { diff --git 
a/src/control/server/storage/bdev/firmware_test.go b/src/control/server/storage/bdev/firmware_test.go index 419d7eb9eab..35c4df09a0a 100644 --- a/src/control/server/storage/bdev/firmware_test.go +++ b/src/control/server/storage/bdev/firmware_test.go @@ -293,7 +293,7 @@ func TestProvider_UpdateFirmware(t *testing.T) { backendCfg: &MockBackendConfig{ ScanRes: &storage.BdevScanResponse{Controllers: defaultDevs}, }, - expErr: storage.FaultBdevNotFound("fake"), + expErr: storage.FaultBdevNotFound(false, "fake"), }, "request duplicates": { input: storage.NVMeFirmwareUpdateRequest{ diff --git a/src/control/server/storage/bdev_test.go b/src/control/server/storage/bdev_test.go index 4341947b370..8f307632b1e 100644 --- a/src/control/server/storage/bdev_test.go +++ b/src/control/server/storage/bdev_test.go @@ -126,7 +126,7 @@ func Test_LedState(t *testing.T) { // Test_Convert_SmdDevice verifies proto->native and native->native JSON conversions. func Test_Convert_SmdDevice(t *testing.T) { - native := MockSmdDevice(test.MockPCIAddr(1)) + native := MockSmdDevice(MockNvmeController(1)) origTgts := native.TargetIDs // Validate target IDs get de-duplicated and HasSysXS set appropriately native.TargetIDs = append(native.TargetIDs, sysXSTgtID, native.TargetIDs[0]) @@ -167,19 +167,37 @@ func Test_Convert_SmdDevice(t *testing.T) { t.Fatalf("expected new device to match original (-want, +got):\n%s\n", diff) } + newNative.Ctrlr.Serial = "" + out, err := json.Marshal(newNative) if err != nil { t.Fatal(err) } - expOut := `{"role_bits":7,"uuid":"00000001-0001-0001-0001-000000000001","tgt_ids":[5,6,7,8],` + - `"rank":0,"total_bytes":0,"avail_bytes":0,"usable_bytes":0,"cluster_size":0,` + - `"meta_size":0,"meta_wal_size":0,"rdb_size":0,"rdb_wal_size":0,` + - `"roles":"data,meta,wal","has_sys_xs":true,"ctrlr":{"info":"","model":"",` + - `"serial":"","pci_addr":"0000:01:00.0","fw_rev":"","vendor_id":"","pci_type":"",` + - `"socket_id":0,"health_stats":null,"namespaces":null,"smd_devices":null,` + - `"dev_state":"EVICTED","led_state":"ON"},"ctrlr_namespace_id":0}` + expOut := `{"role_bits":7,"uuid":"00000001-0001-0001-0001-000000000001","` + + `tgt_ids":[5,6,7,8],"rank":0,"total_bytes":0,"avail_bytes":0,"` + + `usable_bytes":0,"cluster_size":0,"meta_size":0,"meta_wal_size":0,"` + + `rdb_size":0,"rdb_wal_size":0,"roles":"data,meta,wal","has_sys_xs"` + + `:true,"ctrlr":{"info":"","model":"model-1","serial":"","pci_addr` + + `":"0000:01:00.0","fw_rev":"fwRev-1","vendor_id":"","pci_type":""` + + `,"socket_id":1,"health_stats":{"timestamp":0,"warn_temp_time":1,"` + + `crit_temp_time":1,"ctrl_busy_time":1,"power_cycles":1,"power_on_hours":1,"` + + `unsafe_shutdowns":1,"media_errs":1,"err_log_entries":1,"bio_read_errs` + + `":1,"bio_write_errs":1,"bio_unmap_errs":1,"checksum_errs":1,"` + + `temperature":1,"temp_warn":true,"avail_spare_warn":true,"dev_reliability` + + `_warn":true,"read_only_warn":true,"volatile_mem_warn":true` + + `,"program_fail_cnt_norm":1,"program_fail_cnt_raw":1,"erase_fail` + + `_cnt_norm":1,"erase_fail_cnt_raw":1,"wear_leveling_cnt_norm":1,` + + `"wear_leveling_cnt_min":1,"wear_leveling_cnt_max":1,"wear_leveling` + + `_cnt_avg":1,"endtoend_err_cnt_raw":1,"crc_err_cnt_raw":1,"media` + + `_wear_raw":1,"host_reads_raw":1,"workload_timer_raw":1,"thermal` + + `_throttle_status":1,"thermal_throttle_event_cnt":1,"retry_buffer` + + `_overflow_cnt":1,"pll_lock_loss_cnt":1,"nand_bytes_written":1,"` + + `host_bytes_written":1,"cluster_size":0,"meta_wal_size":0,"rdb_wal` + + 
`_size":0},"namespaces":[{"id":1,"size":2000000000000}],"smd_devices` + + `":null,"dev_state":"EVICTED","led_state":"ON"},"ctrlr_namespace` + + `_id":0}` if diff := cmp.Diff(expOut, string(out)); diff != "" { - t.Fatalf("expected json output to be human readable (-want, +got):\n%s\n", diff) + t.Fatalf("expected json output to match (-want, +got):\n%s\n", diff) } } @@ -187,12 +205,9 @@ func Test_NvmeController_Update(t *testing.T) { mockCtrlrs := MockNvmeControllers(5) // Verify in-place update. - test.AssertEqual(t, mockCtrlrs[1].SmdDevices[0].UUID, test.MockUUID(1), "unexpected uuid") c1 := MockNvmeController(1) - c1.SmdDevices[0].UUID = test.MockUUID(10) mockCtrlrs.Update(*c1) test.AssertEqual(t, len(mockCtrlrs), 5, "expected 5") - test.AssertEqual(t, mockCtrlrs[1].SmdDevices[0].UUID, test.MockUUID(10), "unexpected uuid") // Verify multiple new controllers are added. c2 := MockNvmeController(6) diff --git a/src/control/server/storage/faults.go b/src/control/server/storage/faults.go index cbf029c93f2..1bb42c20d2b 100644 --- a/src/control/server/storage/faults.go +++ b/src/control/server/storage/faults.go @@ -189,14 +189,18 @@ func FaultBdevConfigBadNrRoles(role string, gotNr, wantNr int) *fault.Fault { role, wantNr)) } -// FaultBdevNotFound creates a Fault for the case where no NVMe storage devices -// match expected PCI addresses. -func FaultBdevNotFound(bdevs ...string) *fault.Fault { +// FaultBdevNotFound creates a Fault for the case where no NVMe storage devices match expected PCI +// addresses. VMD addresses are expected to have backing devices. +func FaultBdevNotFound(vmdEnabled bool, bdevs ...string) *fault.Fault { + msg := fmt.Sprintf("NVMe SSD%s", common.Pluralise("", len(bdevs))) + if vmdEnabled { + msg = "backing devices for VMDs" + } + return storageFault( code.BdevNotFound, - fmt.Sprintf("NVMe SSD%s %v not found", common.Pluralise("", len(bdevs)), bdevs), - fmt.Sprintf("check SSD%s %v that are specified in server config exist", - common.Pluralise("", len(bdevs)), bdevs), + fmt.Sprintf("%s %v not found", msg, bdevs), + fmt.Sprintf("check %s %v that are specified in server config exist", msg, bdevs), ) } diff --git a/src/control/server/storage/mocks.go b/src/control/server/storage/mocks.go index 57fec249ed1..bd528937c0b 100644 --- a/src/control/server/storage/mocks.go +++ b/src/control/server/storage/mocks.go @@ -96,19 +96,18 @@ func MockNvmeNamespace(varIdx ...int32) *NvmeNamespace { } // MockSmdDevice returns struct with examples values. -func MockSmdDevice(parentTrAddr string, varIdx ...int32) *SmdDevice { +func MockSmdDevice(c *NvmeController, varIdx ...int32) *SmdDevice { idx := test.GetIndex(varIdx...) startTgt := (idx * 4) + 1 - return &SmdDevice{ + sd := SmdDevice{ UUID: test.MockUUID(idx), TargetIDs: []int32{startTgt, startTgt + 1, startTgt + 2, startTgt + 3}, Roles: BdevRoles{OptionBits(BdevRoleAll)}, - Ctrlr: NvmeController{ - NvmeState: NvmeStateNormal, - LedState: LedStateNormal, - PciAddr: parentTrAddr, - }, } + if c != nil { + sd.Ctrlr = *c + } + return &sd } // MockNvmeController returns struct with examples values. 
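The reworked mock above inverts the construction order: tests now build the controller first and hand it to MockSmdDevice, which embeds it wholesale (bdev_test.go above already calls MockSmdDevice(MockNvmeController(1))); the hunk that follows drops the circular SmdDevices reference from MockNvmeController accordingly. A hedged sketch of the new pattern as a standalone test (test name hypothetical; import path per this tree):

package storage_test

import (
	"testing"

	"github.com/daos-stack/daos/src/control/server/storage"
)

// TestMockSmdDevice_embedsCtrlr sketches the updated construction order:
// build the controller mock first, then wrap it in the SMD device mock.
func TestMockSmdDevice_embedsCtrlr(t *testing.T) {
	c := storage.MockNvmeController(1)
	sd := storage.MockSmdDevice(c)
	if sd.Ctrlr.PciAddr != c.PciAddr {
		t.Fatalf("want embedded ctrlr %s, got %s", c.PciAddr, sd.Ctrlr.PciAddr)
	}
}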
@@ -126,7 +125,6 @@ func MockNvmeController(varIdx ...int32) *NvmeController { LedState: LedStateNormal, HealthStats: MockNvmeHealth(idx), Namespaces: []*NvmeNamespace{MockNvmeNamespace(1)}, - SmdDevices: []*SmdDevice{MockSmdDevice(pciAddr, idx)}, } } diff --git a/src/control/server/storage/provider.go b/src/control/server/storage/provider.go index 6856491b8b7..5d99ae38e9b 100644 --- a/src/control/server/storage/provider.go +++ b/src/control/server/storage/provider.go @@ -41,7 +41,6 @@ type Provider struct { metadata MetadataProvider scm ScmProvider bdev BdevProvider - bdevCache BdevScanResponse vmdEnabled bool } @@ -479,7 +478,7 @@ type BdevTierFormatResult struct { // FormatBdevTiers formats all the Bdev tiers in the engine storage // configuration. -func (p *Provider) FormatBdevTiers() (results []BdevTierFormatResult) { +func (p *Provider) FormatBdevTiers(ctrlrs NvmeControllers) (results []BdevTierFormatResult) { bdevCfgs := p.engineStorage.Tiers.BdevConfigs() results = make([]BdevTierFormatResult, len(bdevCfgs)) @@ -492,20 +491,20 @@ func (p *Provider) FormatBdevTiers() (results []BdevTierFormatResult) { p.log.Infof("Instance %d: starting format of %s block devices %v", p.engineIndex, cfg.Class, cfg.Bdev.DeviceList) - results[i].Tier = cfg.Tier req, err := BdevFormatRequestFromConfig(p.log, cfg) if err != nil { results[i].Error = err p.log.Errorf("Instance %d: format failed (%s)", err) continue } + req.ScannedBdevs = ctrlrs p.RLock() - req.BdevCache = &p.bdevCache req.VMDEnabled = p.vmdEnabled results[i].Result, results[i].Error = p.bdev.Format(req) p.RUnlock() + results[i].Tier = cfg.Tier if err := results[i].Error; err != nil { p.log.Errorf("Instance %d: format failed (%s)", err) continue @@ -595,7 +594,7 @@ func BdevWriteConfigRequestFromConfig(ctx context.Context, log logging.Logger, c // WriteNvmeConfig creates an NVMe config file which describes what devices // should be used by a DAOS engine process. 
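FormatBdevTiers above and WriteNvmeConfig just below now take the scanned controllers from the caller instead of reading a provider-held cache. A hedged sketch of the resulting call flow, assuming a caller that performs the scan itself (wrapper name and wiring hypothetical; only the provider methods shown in this patch are used):

package server // hypothetical placement; sketch only

import (
	"context"

	"github.com/daos-stack/daos/src/control/logging"
	"github.com/daos-stack/daos/src/control/server/storage"
)

// formatAndWriteConfig threads a single scan result through both format and
// config generation so VMD backing-device substitution sees the same data.
func formatAndWriteConfig(ctx context.Context, log logging.Logger, p *storage.Provider, bdevs *storage.BdevDeviceList) error {
	resp, err := p.ScanBdevs(storage.BdevScanRequest{
		DeviceList: bdevs,
		VMDEnabled: p.IsVMDEnabled(),
	})
	if err != nil {
		return err
	}
	for _, res := range p.FormatBdevTiers(resp.Controllers) {
		if res.Error != nil {
			return res.Error
		}
	}
	return p.WriteNvmeConfig(ctx, log, resp.Controllers)
}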
-func (p *Provider) WriteNvmeConfig(ctx context.Context, log logging.Logger) error { +func (p *Provider) WriteNvmeConfig(ctx context.Context, log logging.Logger, ctrlrs NvmeControllers) error { p.RLock() vmdEnabled := p.vmdEnabled engineIndex := p.engineIndex @@ -607,17 +606,11 @@ func (p *Provider) WriteNvmeConfig(ctx context.Context, log logging.Logger) erro if err != nil { return errors.Wrap(err, "creating write config request") } - if req == nil { - return errors.New("BdevWriteConfigRequestFromConfig returned nil request") - } + req.ScannedBdevs = ctrlrs log.Infof("Writing NVMe config file for engine instance %d to %q", engineIndex, req.ConfigOutputPath) - p.RLock() - defer p.RUnlock() - req.BdevCache = &p.bdevCache - _, err = p.bdev.WriteConfig(*req) return err @@ -629,88 +622,6 @@ type BdevTierScanResult struct { Result *BdevScanResponse } -type scanFn func(BdevScanRequest) (*BdevScanResponse, error) - -func scanBdevTiers(log logging.Logger, vmdEnabled, direct bool, cfg *Config, cache *BdevScanResponse, scan scanFn) ([]BdevTierScanResult, error) { - if cfg == nil { - return nil, errors.New("nil storage config") - } - if cfg.Tiers == nil { - return nil, errors.New("nil storage config tiers") - } - - bdevs := cfg.GetBdevs() - if bdevs.Len() == 0 { - return nil, errors.New("scanBdevTiers should not be called if no bdevs in config") - } - - var bsr BdevScanResponse - scanOrCache := "scanned" - if direct { - req := BdevScanRequest{ - DeviceList: bdevs, - VMDEnabled: vmdEnabled, - } - resp, err := scan(req) - if err != nil { - return nil, err - } - bsr = *resp - } else { - if cache == nil { - cache = &BdevScanResponse{} - } - bsr = *cache - scanOrCache = "cached" - } - log.Debugf("bdevs in cfg: %s, %s: %+v", bdevs, scanOrCache, bsr) - - // Build slice of bdevs-per-tier from the entire scan response. - - bdevCfgs := cfg.Tiers.BdevConfigs() - results := make([]BdevTierScanResult, 0, len(bdevCfgs)) - resultBdevCount := 0 - for _, bc := range bdevCfgs { - if bc.Bdev.DeviceList.Len() == 0 { - continue - } - fbsr, err := filterBdevScanResponse(bc.Bdev.DeviceList, &bsr) - if err != nil { - return nil, errors.Wrapf(err, "filter scan cache for tier-%d", bc.Tier) - } - results = append(results, BdevTierScanResult{ - Tier: bc.Tier, Result: fbsr, - }) - - // Keep tally of total number of controllers added to results. - cpas, err := fbsr.Controllers.Addresses() - if err != nil { - return nil, errors.Wrap(err, "get controller pci addresses") - } - cpas, err = cpas.BackingToVMDAddresses() - if err != nil { - return nil, errors.Wrap(err, "convert backing device to vmd domain addresses") - } - resultBdevCount += cpas.Len() - } - - if resultBdevCount != bdevs.Len() { - log.Noticef("Unexpected scan results, wanted %d controllers got %d", bdevs.Len(), - resultBdevCount) - } - - return results, nil -} - -// ScanBdevTiers scans all Bdev tiers in the provider's engine storage configuration. -// If direct is set to true, bypass cache to retrieve up-to-date details. -func (p *Provider) ScanBdevTiers(direct bool) (results []BdevTierScanResult, err error) { - p.RLock() - defer p.RUnlock() - - return scanBdevTiers(p.log, p.vmdEnabled, direct, p.engineStorage, &p.bdevCache, p.bdev.Scan) -} - // ScanBdevs calls into bdev storage provider to scan SSDs, always bypassing cache. // Function should not be called when engines have been started and SSDs have been claimed by SPDK. 
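With scanBdevTiers and the per-provider cache removed below, a direct scan is now the only path; per the groomDiscoveredBdevs comment earlier, an empty device list simply returns everything discovered. A minimal hedged sketch (wrapper name hypothetical):

package server // sketch only

import "github.com/daos-stack/daos/src/control/server/storage"

// scanAll performs the direct scan that replaces the removed cached path; an
// empty DeviceList means "return every discovered controller".
func scanAll(p *storage.Provider) (*storage.BdevScanResponse, error) {
	return p.ScanBdevs(storage.BdevScanRequest{
		VMDEnabled: p.IsVMDEnabled(),
	})
}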
func (p *Provider) ScanBdevs(req BdevScanRequest) (*BdevScanResponse, error) { @@ -721,39 +632,17 @@ func (p *Provider) ScanBdevs(req BdevScanRequest) (*BdevScanResponse, error) { return p.bdev.Scan(req) } -func (p *Provider) GetBdevCache() BdevScanResponse { - p.RLock() - defer p.RUnlock() - - return p.bdevCache -} - -// SetBdevCache stores given scan response in provider bdev cache. -func (p *Provider) SetBdevCache(resp BdevScanResponse) error { - p.Lock() - defer p.Unlock() - - // Enumerate scan results and filter out any controllers not specified in provider's engine - // storage config. - fResp, err := filterBdevScanResponse(p.engineStorage.GetBdevs(), &resp) - if err != nil { - return errors.Wrap(err, "filtering scan response before caching") - } - - p.log.Debugf("setting bdev cache in storage provider for engine %d: %v", p.engineIndex, - fResp.Controllers) - p.bdevCache = *fResp - p.vmdEnabled = resp.VMDEnabled - - return nil -} - // WithVMDEnabled enables VMD on storage provider. -func (p *Provider) WithVMDEnabled() *Provider { - p.vmdEnabled = true +func (p *Provider) WithVMDEnabled(b bool) *Provider { + p.vmdEnabled = b return p } +// IsVMDEnabled queries whether VMD is enabled on storage provider. +func (p *Provider) IsVMDEnabled() bool { + return p.vmdEnabled +} + func (p *Provider) BdevRoleMetaConfigured() bool { bdevConfigs := p.GetBdevConfigs() for _, bc := range bdevConfigs { diff --git a/src/control/server/storage/provider_test.go b/src/control/server/storage/provider_test.go index 5ed2fb783d9..688563161f1 100644 --- a/src/control/server/storage/provider_test.go +++ b/src/control/server/storage/provider_test.go @@ -7,9 +7,7 @@ package storage import ( - "fmt" "os" - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -33,305 +31,6 @@ func defBdevCmpOpts() []cmp.Option { } } -func Test_scanBdevsTiers(t *testing.T) { - for name, tc := range map[string]struct { - direct bool - vmdEnabled bool - cfg *Config - cache *BdevScanResponse - scanResp *BdevScanResponse - scanErr error - expResults []BdevTierScanResult - expErr error - expNotice bool - }{ - "nil cfg": { - expErr: errors.New("nil storage config"), - }, - "nil cfg tiers": { - cfg: new(Config), - expErr: errors.New("nil storage config tiers"), - }, - "no bdev configs": { - cfg: &Config{ - Tiers: TierConfigs{mockScmTier}, - }, - expErr: errors.New("no bdevs in config"), - }, - "use cache; nil scan cache": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(3)), - }, - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: NvmeControllers{}, - }, - }, - }, - expNotice: true, - }, - "bypass cache; missing controller": { - direct: true, - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(3)), - }, - }, - cache: &BdevScanResponse{ - Controllers: MockNvmeControllers(3), - }, - scanResp: &BdevScanResponse{ - Controllers: MockNvmeControllers(2), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: NvmeControllers{}, - }, - }, - }, - expNotice: true, - }, - "bypass cache": { - direct: true, - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). 
- WithBdevDeviceList(test.MockPCIAddr(2)), - }, - }, - cache: &BdevScanResponse{ - Controllers: MockNvmeControllers(2), - }, - scanResp: &BdevScanResponse{ - Controllers: MockNvmeControllers(3), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(2), - }, - }, - }, - }, - }, - "bypass cache; scan error": { - direct: true, - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2)), - }, - }, - scanResp: &BdevScanResponse{ - Controllers: MockNvmeControllers(3), - }, - scanErr: errors.New("fail"), - expErr: errors.New("fail"), - }, - "use cache; missing controller": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2)), - }, - }, - cache: &BdevScanResponse{ - Controllers: MockNvmeControllers(2), - }, - scanResp: &BdevScanResponse{ - Controllers: MockNvmeControllers(3), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{}, - }, - }, - }, - expNotice: true, - }, - "use cache": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2)), - }, - }, - cache: &BdevScanResponse{ - Controllers: MockNvmeControllers(3), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(2), - }, - }, - }, - }, - }, - "bypass cache; multi-tier": { - direct: true, - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2), test.MockPCIAddr(3)), - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(4), test.MockPCIAddr(5)), - }, - }, - scanResp: &BdevScanResponse{ - Controllers: MockNvmeControllers(6), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(2), MockNvmeController(3), - }, - }, - }, - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(4), MockNvmeController(5), - }, - }, - }, - }, - }, - "use cache; multi-tier": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(2), test.MockPCIAddr(3)), - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList(test.MockPCIAddr(4), test.MockPCIAddr(5)), - }, - }, - cache: &BdevScanResponse{ - Controllers: MockNvmeControllers(6), - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(2), MockNvmeController(3), - }, - }, - }, - { - Result: &BdevScanResponse{ - Controllers: []*NvmeController{ - MockNvmeController(4), MockNvmeController(5), - }, - }, - }, - }, - }, - "use cache; vmd domain missing in scan": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). 
- WithBdevDeviceList("0000:62:00.5", "0000:63:00.5"), - }, - }, - cache: &BdevScanResponse{ - Controllers: NvmeControllers{ - &NvmeController{PciAddr: "620005:83:00.0"}, - &NvmeController{PciAddr: "620005:85:00.0"}, - &NvmeController{PciAddr: "620005:87:00.0"}, - &NvmeController{PciAddr: "620005:81:00.0"}, - }, - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: NvmeControllers{ - &NvmeController{PciAddr: "620005:83:00.0"}, - &NvmeController{PciAddr: "620005:85:00.0"}, - &NvmeController{PciAddr: "620005:87:00.0"}, - &NvmeController{PciAddr: "620005:81:00.0"}, - }, - }, - }, - }, - expNotice: true, - }, - "use cache; multiple devices behind vmd domain": { - cfg: &Config{ - Tiers: TierConfigs{ - mockScmTier, - NewTierConfig().WithStorageClass(ClassNvme.String()). - WithBdevDeviceList("0000:62:00.5"), - }, - }, - cache: &BdevScanResponse{ - Controllers: NvmeControllers{ - &NvmeController{PciAddr: "620005:83:00.0"}, - &NvmeController{PciAddr: "620005:85:00.0"}, - &NvmeController{PciAddr: "620005:87:00.0"}, - &NvmeController{PciAddr: "620005:81:00.0"}, - }, - }, - expResults: []BdevTierScanResult{ - { - Result: &BdevScanResponse{ - Controllers: NvmeControllers{ - &NvmeController{PciAddr: "620005:83:00.0"}, - &NvmeController{PciAddr: "620005:85:00.0"}, - &NvmeController{PciAddr: "620005:87:00.0"}, - &NvmeController{PciAddr: "620005:81:00.0"}, - }, - }, - }, - }, - }, - } { - t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(name) - defer test.ShowBufferOnFailure(t, buf) - - scanFn := func(r BdevScanRequest) (*BdevScanResponse, error) { - return tc.scanResp, tc.scanErr - } - - gotResults, gotErr := scanBdevTiers(log, tc.vmdEnabled, tc.direct, tc.cfg, tc.cache, scanFn) - test.CmpErr(t, tc.expErr, gotErr) - if gotErr != nil { - return - } - - if diff := cmp.Diff(tc.expResults, gotResults, defBdevCmpOpts()...); diff != "" { - t.Fatalf("\nunexpected results (-want, +got):\n%s\n", diff) - } - - txtMod := "" - if !tc.expNotice { - txtMod = "not " - } - msg := fmt.Sprintf("expected NOTICE level message to %shave been logged", txtMod) - test.AssertEqual(t, tc.expNotice, strings.Contains(buf.String(), "NOTICE"), msg) - }) - } -} - func Test_BdevWriteRequestFromConfig(t *testing.T) { hostname, err := os.Hostname() if err != nil { diff --git a/src/control/server/util_test.go b/src/control/server/util_test.go index f34c6d16f66..67037a17253 100644 --- a/src/control/server/util_test.go +++ b/src/control/server/util_test.go @@ -200,14 +200,14 @@ func newTestEngine(log logging.Logger, isAP bool, provider *storage.Provider, en rCfg.Running.SetTrue() r := engine.NewTestRunner(rCfg, engineCfg[0]) - srv := NewEngineInstance(log, provider, nil, r) - srv.setSuperblock(&Superblock{ + e := NewEngineInstance(log, provider, nil, r) + e.setSuperblock(&Superblock{ Rank: ranklist.NewRankPtr(0), }) - srv.ready.SetTrue() - srv.OnReady() + e.ready.SetTrue() + e.OnReady() - return srv + return e } // mockTCPResolver returns successful resolve results for any input. 
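Stepping back before the engine-side DTX changes: the control-plane hunks above replace the old scan-response cache with explicit plumbing, so VMD enablement now travels from the control service's provider into each per-engine provider at creation time. Condensed from the createEngine hunk earlier in this patch (surrounding fields assumed from that context):

// createEngine seeds each engine's storage provider with the control
// service's VMD state up front, instead of via SetBdevCache:
sp := storage.DefaultProvider(srv.log, idx, &cfg.Storage).
	WithVMDEnabled(srv.ctlSvc.storage.IsVMDEnabled())

engine := NewEngineInstance(srv.log, sp, joinFn, engine.NewRunner(srv.log, cfg)).
	WithHostFaultDomain(srv.harness.faultDomain)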
diff --git a/src/dtx/SConscript b/src/dtx/SConscript index 5a6849671de..4d0f1f2dcb3 100644 --- a/src/dtx/SConscript +++ b/src/dtx/SConscript @@ -18,7 +18,8 @@ def scons(): # dtx denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) dtx = denv.d_library('dtx', - ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c'], + ['dtx_srv.c', 'dtx_rpc.c', 'dtx_resync.c', 'dtx_common.c', 'dtx_cos.c', + 'dtx_coll.c'], install_off="../..") denv.Install('$PREFIX/lib64/daos_srv', dtx) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c new file mode 100644 index 00000000000..447d3682849 --- /dev/null +++ b/src/dtx/dtx_coll.c @@ -0,0 +1,373 @@ +/** + * (C) Copyright 2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * dtx: DTX collective RPC logic + */ +#define D_LOGFAC DD_FAC(dtx) + +#include +#include +#include +#include +#include +#include +#include +#include "dtx_internal.h" + +/* + * For collective DTX, when commit/abort/check the DTX on system XS (on non-leader), we cannot + * directly locate the DTX entry since no VOS target is attached to system XS. In such a case, + * we have two options: + * + * 1. The DTX leader (on IO XS) knows on which VOS target the non-leader can find the DTX + * entry, so the DTX leader can send the related information (IO XS index) to the non-leader. + * + * 2. The non-leader can start ULTs on every local XS collectively to find the DTX by force, + * regardless of whether the related DTX entry really exists on the VOS target or not. + * + * Usually the 2nd option causes more overhead and should be avoided, so the 1st is the + * better choice. On the other hand, if there are a lot of VOS targets in the system, it + * may be inefficient to send all VOS target information to all related non-leaders via bcast. + * Instead, we send only one VOS target's information to each non-leader; the non-leader + * can then load the mbs (dtx_memberships) from the DTX entry and calculate the other VOS + * targets' information by itself. + */ + +struct dtx_coll_local_args { + uuid_t dcla_po_uuid; + uuid_t dcla_co_uuid; + struct dtx_id dcla_xid; + daos_epoch_t dcla_epoch; + uint32_t dcla_opc; + int *dcla_results; +}; + +void +dtx_coll_prep_ult(void *arg) +{ + struct dtx_coll_prep_args *dcpa = arg; + struct dtx_coll_in *dci = crt_req_get(dcpa->dcpa_rpc); + struct dtx_memberships *mbs = NULL; + struct ds_cont_child *cont = NULL; + uint32_t opc = opc_get(dcpa->dcpa_rpc->cr_opc); + int rc = 0; + + dcpa->dcpa_result = ds_cont_child_lookup(dci->dci_po_uuid, dci->dci_co_uuid, &cont); + if (dcpa->dcpa_result != 0) { + D_ERROR("Failed to locate pool="DF_UUID" cont="DF_UUID" for DTX " + DF_DTI" with opc %u: "DF_RC"\n", + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), + DP_DTI(&dci->dci_xid), opc, DP_RC(dcpa->dcpa_result)); + /* + * Convert the container-non-exist case to -DER_IO to distinguish it from + * the case where the DTX entry does not exist; the latter is normal.
+ */ + if (dcpa->dcpa_result == -DER_NONEXIST) + dcpa->dcpa_result = -DER_IO; + + goto out; + } + + dcpa->dcpa_result = vos_dtx_load_mbs(cont->sc_hdl, &dci->dci_xid, &dcpa->dcpa_oid, &mbs); + if (dcpa->dcpa_result == -DER_INPROGRESS && !dtx_cont_opened(cont) && + opc == DTX_COLL_CHECK) { + rc = start_dtx_reindex_ult(cont); + if (rc != 0) + D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n", + DP_UUID(cont->sc_uuid), DP_RC(rc)); + } + + if (dcpa->dcpa_result != 0) + goto out; + + dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, + dci->dci_version, cont->sc_pool->spc_map_version, + opc == DTX_COLL_CHECK, false, &dcpa->dcpa_dce); + if (dcpa->dcpa_result != 0) + D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX " + DF_DTI" opc %u: "DF_RC"\n", DP_DTI(&dci->dci_xid), opc, + DP_RC(dcpa->dcpa_result)); + +out: + if (cont != NULL) + ds_cont_child_put(cont); + + rc = ABT_future_set(dcpa->dcpa_future, NULL); + D_ASSERT(rc == ABT_SUCCESS); +} + +int +dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dtx_memberships *mbs, + uint32_t my_tgtid, uint32_t dtx_ver, uint32_t pm_ver, bool for_check, bool need_hint, + struct dtx_coll_entry **p_dce) +{ + struct pl_map *map = NULL; + struct pl_obj_layout *layout = NULL; + struct pool_target *target; + struct dtx_daos_target *ddt; + struct dtx_coll_target *dct; + struct dtx_coll_entry *dce = NULL; + struct daos_obj_md md = { 0 }; + uint32_t node_nr; + d_rank_t my_rank = dss_self_rank(); + d_rank_t max_rank = 0; + int rc = 0; + int i; + int j; + + D_ASSERT(mbs->dm_flags & DMF_COLL_TARGET); + + D_ALLOC_PTR(dce); + if (dce == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_xid = *xid; + dce->dce_ver = dtx_ver; + dce->dce_refs = 1; + + ddt = &mbs->dm_tgts[0]; + dct = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt); + D_ALLOC(dce->dce_bitmap, dct->dct_bitmap_sz); + if (dce->dce_bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_bitmap_sz = dct->dct_bitmap_sz; + + if (!for_check) { + memcpy(dce->dce_bitmap, dct->dct_tgts + dct->dct_tgt_nr, dct->dct_bitmap_sz); + } else { + map = pl_map_find(po_uuid, oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + for (i = 0, j = 0; i < dct->dct_tgt_nr; i++) { + rc = pool_map_find_target(map->pl_poolmap, dct->dct_tgts[i], &target); + D_ASSERT(rc == 1); + + /* Skip the targets that reside on other engines. */ + if (unlikely(target->ta_comp.co_rank != my_rank)) + continue; + + /* Skip the target that (re-)joined the system after the DTX. */ + if (target->ta_comp.co_ver > dtx_ver) + continue; + + /* Skip non-healthy one. */ + if (target->ta_comp.co_status != PO_COMP_ST_UP && + target->ta_comp.co_status != PO_COMP_ST_UPIN && + target->ta_comp.co_status != PO_COMP_ST_NEW && + target->ta_comp.co_status != PO_COMP_ST_DRAIN) + continue; + + /* Skip current (new) leader target. 
*/ + if (my_tgtid != target->ta_comp.co_index) { + setbit(dce->dce_bitmap, target->ta_comp.co_index); + j++; + } + } + + rc = 0; + + if (unlikely(j == 0)) { + D_FREE(dce->dce_bitmap); + dce->dce_bitmap_sz = 0; + } + } + + if (!need_hint) + goto out; + + if (map == NULL) { + map = pl_map_find(po_uuid, oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + } + + node_nr = pool_map_node_nr(map->pl_poolmap); + if (unlikely(node_nr == 1)) + D_GOTO(out, rc = 0); + + dce->dce_ranks = d_rank_list_alloc(node_nr - 1); + if (dce->dce_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + D_ALLOC_ARRAY(dce->dce_hints, node_nr); + if (dce->dce_hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < node_nr; i++) + dce->dce_hints[i] = (uint8_t)(-1); + + md.omd_id = oid.id_pub; + md.omd_ver = pm_ver; + md.omd_fdom_lvl = dct->dct_fdom_lvl; + md.omd_pda = dct->dct_pda; + md.omd_pdom_lvl = dct->dct_pdom_lvl; + + rc = pl_obj_place(map, oid.id_layout_ver, &md, DAOS_OO_RW, NULL, &layout); + if (rc != 0) { + D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n", + DP_OID(oid.id_pub), DP_UUID(po_uuid)); + goto out; + } + + for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) { + if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) + continue; + + rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, &target); + D_ASSERT(rc == 1); + + /* Skip current leader rank. */ + if (target->ta_comp.co_rank == my_rank) + continue; + + /* Skip the target that (re-)joined the system after the DTX. */ + if (target->ta_comp.co_ver > dtx_ver) + continue; + + /* Skip non-healthy one. */ + if (target->ta_comp.co_status != PO_COMP_ST_UP && + target->ta_comp.co_status != PO_COMP_ST_UPIN && + target->ta_comp.co_status != PO_COMP_ST_NEW && + target->ta_comp.co_status != PO_COMP_ST_DRAIN) + continue; + + if (dce->dce_hints[target->ta_comp.co_rank] == (uint8_t)(-1)) { + dce->dce_hints[target->ta_comp.co_rank] = target->ta_comp.co_index; + dce->dce_ranks->rl_ranks[j++] = target->ta_comp.co_rank; + if (max_rank < target->ta_comp.co_rank) + max_rank = target->ta_comp.co_rank; + } + } + + rc = 0; + + /* + * It does not matter that the real size of the rl_ranks array is larger than rl_nr. + * Reduce rl_nr to skip the undefined ranks at the tail of rl_ranks.
+ */ + if (unlikely(j == 0)) { + d_rank_list_free(dce->dce_ranks); + dce->dce_ranks = NULL; + D_FREE(dce->dce_hints); + dce->dce_hint_sz = 0; + } else { + dce->dce_ranks->rl_nr = j; + dce->dce_hint_sz = max_rank + 1; + } + +out: + if (layout != NULL) + pl_obj_layout_free(layout); + + if (map != NULL) + pl_map_decref(map); + + if (rc != 0) + dtx_coll_entry_put(dce); + else + *p_dce = dce; + + return rc; +} + +static int +dtx_coll_local_one(void *args) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct dtx_coll_local_args *dcla = args; + struct ds_cont_child *cont = NULL; + uint32_t opc = dcla->dcla_opc; + int rc; + int rc1; + + rc = ds_cont_child_lookup(dcla->dcla_po_uuid, dcla->dcla_co_uuid, &cont); + if (rc != 0) { + D_ERROR("Failed to locate "DF_UUID"/"DF_UUID" for collective DTX " + DF_DTI" rpc %u: "DF_RC"\n", DP_UUID(dcla->dcla_po_uuid), + DP_UUID(dcla->dcla_co_uuid), DP_DTI(&dcla->dcla_xid), opc, DP_RC(rc)); + goto out; + } + + switch (opc) { + case DTX_COLL_COMMIT: + rc = vos_dtx_commit(cont->sc_hdl, &dcla->dcla_xid, 1, NULL); + break; + case DTX_COLL_ABORT: + rc = vos_dtx_abort(cont->sc_hdl, &dcla->dcla_xid, dcla->dcla_epoch); + break; + case DTX_COLL_CHECK: + rc = vos_dtx_check(cont->sc_hdl, &dcla->dcla_xid, NULL, NULL, NULL, NULL, false); + if (rc == DTX_ST_INITED) { + /* + * For DTX_CHECK, a non-ready entry is equivalent to a non-existent one. + * Do not return 'DTX_ST_INITED' directly, to avoid interoperability + * trouble if the related request is from an old server. + */ + rc = -DER_NONEXIST; + } else if (rc == -DER_INPROGRESS && !dtx_cont_opened(cont)) { + /* Trigger DTX re-index for subsequent (retry) DTX_CHECK. */ + rc1 = start_dtx_reindex_ult(cont); + if (rc1 != 0) + D_ERROR("Failed to trigger DTX reindex for "DF_UUID"/"DF_UUID + " on target %u/%u: "DF_RC"\n", + DP_UUID(dcla->dcla_po_uuid), DP_UUID(dcla->dcla_co_uuid), + dss_self_rank(), dmi->dmi_tgt_id, DP_RC(rc1)); + } + break; + default: + D_ASSERTF(0, "Unknown collective DTX opc %u\n", opc); + D_GOTO(out, rc = -DER_NOTSUPPORTED); + } + +out: + dcla->dcla_results[dmi->dmi_tgt_id] = rc; + if (cont != NULL) + ds_cont_child_put(cont); + + return 0; +} + +int +dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results) +{ + struct dtx_coll_local_args dcla = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int rc; + + D_ALLOC_ARRAY(dcla.dcla_results, dss_tgt_nr); + if (dcla.dcla_results == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + uuid_copy(dcla.dcla_po_uuid, po_uuid); + uuid_copy(dcla.dcla_co_uuid, co_uuid); + dcla.dcla_xid = *xid; + dcla.dcla_epoch = epoch; + dcla.dcla_opc = opc; + + coll_ops.co_func = dtx_coll_local_one; + coll_args.ca_func_args = &dcla; + coll_args.ca_tgt_bitmap_sz = bitmap_sz; + coll_args.ca_tgt_bitmap = bitmap; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Locally exec collective DTX RPC %u for "DF_DTI": "DF_RC"\n", + opc, DP_DTI(xid), DP_RC(rc)); + +out: + *p_results = dcla.dcla_results; + return rc < 0 ? rc : dss_tgt_nr; +} diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 2b549c06c8f..0a3d2b193a7 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -24,7 +24,6 @@ uint32_t dtx_agg_thd_age_up; uint32_t dtx_agg_thd_age_lo; uint32_t dtx_batched_ult_max; - struct dtx_batched_pool_args { /* Link to dss_module_info::dmi_dtx_batched_pool_list.
*/ d_list_t dbpa_sys_link; @@ -73,13 +72,19 @@ struct dtx_cleanup_cb_args { static inline void dtx_free_committable(struct dtx_entry **dtes, struct dtx_cos_key *dcks, - int count) + struct dtx_coll_entry *dce, int count) { int i; - for (i = 0; i < count; i++) - dtx_entry_put(dtes[i]); - D_FREE(dtes); + if (dce != NULL) { + D_ASSERT(count == 1); + + dtx_coll_entry_put(dce); + } else { + for (i = 0; i < count; i++) + dtx_entry_put(dtes[i]); + D_FREE(dtes); + } D_FREE(dcks); } @@ -109,7 +114,9 @@ dtx_free_dbca(struct dtx_batched_cont_args *dbca) } D_ASSERT(cont->sc_dtx_committable_count == 0); + D_ASSERT(cont->sc_dtx_committable_coll_count == 0); D_ASSERT(d_list_empty(&cont->sc_dtx_cos_list)); + D_ASSERT(d_list_empty(&cont->sc_dtx_coll_list)); /* Even if the container is reopened during current deregister, the * reopen will use new dbca, so current dbca needs to be cleanup. @@ -184,6 +191,7 @@ dtx_stat(struct ds_cont_child *cont, struct dtx_stat *stat) vos_dtx_stat(cont->sc_hdl, stat, DSF_SKIP_BAD); stat->dtx_committable_count = cont->sc_dtx_committable_count; + stat->dtx_committable_coll_count = cont->sc_dtx_committable_coll_count; stat->dtx_oldest_committable_time = dtx_cos_oldest(cont); } @@ -263,6 +271,7 @@ dtx_cleanup_iter_cb(uuid_t co_uuid, vos_iter_entry_t *ent, void *args) dsp->dsp_xid = ent->ie_dtx_xid; dsp->dsp_oid = ent->ie_dtx_oid; dsp->dsp_epoch = ent->ie_epoch; + dsp->dsp_version = ent->ie_dtx_ver; if (ent->ie_dtx_mbs_dsize > DTX_INLINE_MBS_SIZE) goto add; @@ -303,12 +312,14 @@ dtx_dpci_free(struct dtx_partial_cmt_item *dpci) static void dtx_cleanup(void *arg) { + struct dss_module_info *dmi = dss_get_module_info(); struct dtx_batched_cont_args *dbca = arg; struct ds_cont_child *cont = dbca->dbca_cont; struct dtx_share_peer *dsp; struct dtx_partial_cmt_item *dpci; struct dtx_entry *dte; struct dtx_cleanup_cb_args dcca; + daos_unit_oid_t oid; d_list_t cmt_list; d_list_t abt_list; d_list_t act_list; @@ -366,9 +377,24 @@ dtx_cleanup(void *arg) dte = &dpci->dpci_dte; if (dte->dte_mbs == NULL) - rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &dte->dte_mbs); - if (dte->dte_mbs != NULL) - rc = dtx_commit(cont, &dte, NULL, 1); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dte->dte_xid, &oid, &dte->dte_mbs); + if (dte->dte_mbs != NULL) { + if (dte->dte_mbs->dm_flags & DMF_COLL_TARGET) { + struct dtx_coll_entry *dce = NULL; + + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, &dte->dte_xid, + dte->dte_mbs, dmi->dmi_tgt_id, dte->dte_ver, + cont->sc_pool->spc_map_version, false, true, &dce); + if (rc == 0) { + D_ASSERT(dce != NULL); + + rc = dtx_coll_commit(cont, dce, NULL); + dtx_coll_entry_put(dce); + } + } else { + rc = dtx_commit(cont, &dte, NULL, 1); + } + } D_DEBUG(DB_IO, "Cleanup partial committed DTX "DF_DTI", left %d: %d\n", DP_DTI(&dte->dte_xid), dcca.dcca_pc_count, rc); @@ -594,12 +620,13 @@ dtx_batched_commit_one(void *arg) dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, &dtes, &dcks); + DAOS_EPOCH_MAX, &dtes, &dcks, &dce); if (cnt == 0) break; @@ -609,8 +636,15 @@ dtx_batched_commit_one(void *arg) break; } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + /* Currently, commit collective DTX one by one. 
*/ + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -624,6 +658,7 @@ dtx_batched_commit_one(void *arg) sched_req_wakeup(dmi->dmi_dtx_agg_req); if ((stat.dtx_committable_count <= DTX_THRESHOLD_COUNT) && + (stat.dtx_committable_coll_count == 0) && (stat.dtx_oldest_committable_time == 0 || d_hlc_age2sec(stat.dtx_oldest_committable_time) < DTX_COMMIT_THRESHOLD_AGE)) @@ -689,6 +724,7 @@ dtx_batched_commit(void *arg) if (dtx_cont_opened(cont) && dbca->dbca_commit_req == NULL && (dtx_batched_ult_max != 0 && tls->dt_batched_ult_cnt < dtx_batched_ult_max) && ((stat.dtx_committable_count > DTX_THRESHOLD_COUNT) || + (stat.dtx_committable_coll_count > 0) || (stat.dtx_oldest_committable_time != 0 && d_hlc_age2sec(stat.dtx_oldest_committable_time) >= DTX_COMMIT_THRESHOLD_AGE))) { @@ -846,11 +882,9 @@ dtx_handle_reinit(struct dtx_handle *dth) */ static int dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, - uint16_t sub_modification_cnt, uint32_t pm_ver, - daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, - int dti_cos_cnt, struct dtx_memberships *mbs, bool leader, - bool solo, bool sync, bool dist, bool migration, bool ignore_uncommitted, - bool resent, bool prepared, bool drop_cmt, struct dtx_handle *dth) + bool leader, uint16_t sub_modification_cnt, uint32_t pm_ver, + daos_unit_oid_t *leader_oid, struct dtx_id *dti_cos, int dti_cos_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_handle *dth) { if (sub_modification_cnt > DTX_SUB_MOD_MAX) { D_ERROR("Too many modifications in a single transaction:" @@ -871,17 +905,16 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_pinned = 0; dth->dth_cos_done = 0; - dth->dth_resent = resent ? 1 : 0; - dth->dth_solo = solo ? 1 : 0; - dth->dth_drop_cmt = drop_cmt ? 1 : 0; dth->dth_modify_shared = 0; dth->dth_active = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; - dth->dth_dist = dist ? 1 : 0; - dth->dth_for_migration = migration ? 1 : 0; - dth->dth_ignore_uncommitted = ignore_uncommitted ? 1 : 0; - dth->dth_prepared = prepared ? 1 : 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; dth->dth_aborted = 0; dth->dth_already = 0; dth->dth_need_validation = 0; @@ -891,7 +924,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t coh, struct dtx_epoch *epoch, dth->dth_ent = NULL; dth->dth_flags = leader ? DTE_LEADER : 0; - if (sync) { + if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; dth->dth_sync = 1; } else { @@ -1102,20 +1135,19 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) * \param tgt_cnt [IN] number of targets (not count the leader itself). * \param flags [IN] See dtx_flags. * \param mbs [IN] DTX participants information. + * \param dce [IN] The pointer to collective DTX entry. * \param p_dlh [OUT] Pointer to the DTX handle. * * \return Zero on success, negative value if error. 
*/ int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh) +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_coll_entry *dce, + struct dtx_leader_handle **p_dlh) { struct dtx_leader_handle *dlh; - struct dtx_tls *tls = dtx_tls_get(); struct dtx_handle *dth; int rc; int i; @@ -1124,32 +1156,45 @@ dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, if (dlh == NULL) return -DER_NOMEM; + dlh->dlh_future = ABT_FUTURE_NULL; + dlh->dlh_coll_entry = dce; + if (flags & DTX_TGT_COLL) + dlh->dlh_coll = 1; + if (tgt_cnt > 0) { - dlh->dlh_future = ABT_FUTURE_NULL; dlh->dlh_subs = (struct dtx_sub_status *)(dlh + 1); - for (i = 0; i < tgt_cnt; i++) { - dlh->dlh_subs[i].dss_tgt = tgts[i]; - if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) - dlh->dlh_delay_sub_cnt++; + + if (flags & DTX_TGT_COLL) { + /* + * NOTE: Do not support DTF_DELAY_FORWARD for collective DTX. + * The target information will be filled sometime later + * when dispatch related IO request. + */ + dlh->dlh_delay_sub_cnt = 0; + dlh->dlh_normal_sub_cnt = tgt_cnt; + } else { + for (i = 0; i < tgt_cnt; i++) { + dlh->dlh_subs[i].dss_tgt = tgts[i]; + if (unlikely(tgts[i].st_flags & DTF_DELAY_FORWARD)) + dlh->dlh_delay_sub_cnt++; + } + + dlh->dlh_normal_sub_cnt = tgt_cnt - dlh->dlh_delay_sub_cnt; } - dlh->dlh_normal_sub_cnt = tgt_cnt - dlh->dlh_delay_sub_cnt; } + if (flags & DTX_RELAY) + dlh->dlh_relay = 1; + dth = &dlh->dlh_handle; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, pm_ver, - leader_oid, dti_cos, dti_cos_cnt, mbs, true, - (flags & DTX_SOLO) ? true : false, - (flags & DTX_SYNC) ? true : false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, false, - (flags & DTX_RESEND) ? true : false, - (flags & DTX_PREPARED) ? true : false, - (flags & DTX_DROP_CMT) ? true : false, dth); + rc = dtx_handle_init(dti, coh, epoch, dlh->dlh_relay ? false : true, sub_modification_cnt, + pm_ver, leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, (flags & DTX_PREPARED) ? true : false); - D_DEBUG(DB_IO, "Start DTX "DF_DTI" sub modification %d, ver %u, epoch "DF_X64", leader " - DF_UOID", dti_cos_cnt %d, tgt_cnt %d, flags %x: "DF_RC"\n", + D_DEBUG(DB_IO, "Start (%s) DTX "DF_DTI" sub modification %d, ver %u, epoch " + DF_X64", leader "DF_UOID", dti_cos_cnt %d, tgt_cnt %d, flags %x: "DF_RC"\n", + dlh->dlh_coll ? (dlh->dlh_relay ? 
"relay" : "collective") : "regular", DP_DTI(dti), sub_modification_cnt, dth->dth_ver, epoch->oe_value, DP_UOID(*leader_oid), dti_cos_cnt, tgt_cnt, flags, DP_RC(rc)); @@ -1157,7 +1202,7 @@ dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, D_FREE(dlh); } else { *p_dlh = dlh; - d_tm_inc_gauge(tls->dt_dtx_leader_total, 1); + d_tm_inc_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); } return rc; @@ -1182,17 +1227,6 @@ dtx_leader_wait(struct dtx_leader_handle *dlh) return dlh->dlh_result; }; -void -dtx_entry_put(struct dtx_entry *dte) -{ - if (--(dte->dte_refs) == 0) { - struct dtx_tls *tls = dtx_tls_get(); - - d_tm_dec_gauge(tls->dt_dtx_entry_total, 1); - D_FREE(dte); - } -} - /** * Stop the leader thandle. * @@ -1207,7 +1241,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul { struct ds_cont_child *cont = coh->sch_cont; struct dtx_handle *dth = &dlh->dlh_handle; - struct dtx_tls *tls = dtx_tls_get(); struct dtx_entry *dte; struct dtx_memberships *mbs; size_t size; @@ -1221,7 +1254,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul dtx_shares_fini(dth); - if (daos_is_zero_dti(&dth->dth_xid) || unlikely(result == -DER_ALREADY)) + if (daos_is_zero_dti(&dth->dth_xid) || unlikely(result == -DER_ALREADY) || dlh->dlh_relay) goto out; if (unlikely(coh->sch_closed)) { @@ -1275,24 +1308,11 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_ASSERTF(0, "Unexpected DTX "DF_DTI" status %d\n", DP_DTI(&dth->dth_xid), status); } - if ((!dth->dth_active && dth->dth_dist) || dth->dth_prepared || dtx_batched_ult_max == 0) { - /* We do not know whether some other participants have - * some active entry for this DTX, consider distributed - * transaction case, the other participants may execute - * different operations. Sync commit the DTX for safe. - */ + if (dth->dth_prepared || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } - /* For standalone modification, if leader modified nothing, then - * non-leader(s) must be the same, unpin the DTX via dtx_abort(). - */ - if (!dth->dth_active) { - unpin = true; - D_GOTO(abort, result = 0); - } - if (DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE)) D_GOTO(abort, result = 0); @@ -1310,45 +1330,42 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_ASSERT(dth->dth_mbs != NULL); - size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; - D_ALLOC(dte, size); - if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } + if (dlh->dlh_coll) { + rc = dtx_add_cos(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, + dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); + } else { + size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; + D_ALLOC(dte, size); + if (dte == NULL) { + dth->dth_sync = 1; + goto sync; + } - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; - d_tm_inc_gauge(tls->dt_dtx_entry_total, 1); + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; - /* Use the new created @dte instead of dth->dth_dte that will be - * released after dtx_leader_end(). 
- */ + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; + + rc = dtx_add_cos(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; - rc = dtx_add_cos(cont, dte, &dth->dth_leader_oid, - dth->dth_dkey_hash, dth->dth_epoch, flags); - dtx_entry_put(dte); if (rc == 0) { if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > - DTX_THRESHOLD_COUNT) { - struct dss_module_info *dmi; - - dmi = dss_get_module_info(); - sched_req_wakeup(dmi->dmi_dtx_cmt_req); - } + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } } else { dth->dth_sync = 1; @@ -1362,11 +1379,18 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * batched commit. */ vos_dtx_mark_committable(dth); - dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + + if (dlh->dlh_coll) { + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + } else { + dte = &dth->dth_dte; + rc = dtx_commit(cont, &dte, NULL, 1); + } + if (rc != 0) - D_WARN(DF_UUID": Fail to sync commit DTX "DF_DTI": "DF_RC"\n", - DP_UUID(cont->sc_uuid), DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1391,7 +1415,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * 2. Remove the pinned DTX entry. */ vos_dtx_cleanup(dth, true); - dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); + if (dlh->dlh_coll) + dtx_coll_abort(cont, dlh->dlh_coll_entry, dth->dth_epoch); + else + dtx_abort(cont, &dth->dth_dte, dth->dth_epoch); aborted = true; } @@ -1436,7 +1463,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_FREE(dth->dth_oid_array); D_FREE(dlh); - d_tm_dec_gauge(tls->dt_dtx_leader_total, 1); + d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); return result; } @@ -1473,13 +1500,8 @@ dtx_begin(daos_handle_t coh, struct dtx_id *dti, if (dth == NULL) return -DER_NOMEM; - rc = dtx_handle_init(dti, coh, epoch, sub_modification_cnt, - pm_ver, leader_oid, dti_cos, dti_cos_cnt, mbs, - false, false, false, - (flags & DTX_DIST) ? true : false, - (flags & DTX_FOR_MIGRATION) ? true : false, - (flags & DTX_IGNORE_UNCOMMITTED) ? true : false, - (flags & DTX_RESEND) ? 
true : false, false, false, dth); + rc = dtx_handle_init(dti, coh, epoch, false, sub_modification_cnt, pm_ver, + leader_oid, dti_cos, dti_cos_cnt, flags, mbs, dth); if (rc == 0 && sub_modification_cnt > 0) rc = vos_dtx_attach(dth, false, false); @@ -1567,9 +1589,10 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db while (dbca->dbca_reg_gen == cont->sc_dtx_batched_gen && rc >= 0) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, &dtes, &dcks); + NULL, DAOS_EPOCH_MAX, &dtes, &dcks, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1586,8 +1609,14 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db D_GOTO(out, rc = -DER_MISC); } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); } out: @@ -1734,7 +1763,9 @@ dtx_cont_register(struct ds_cont_child *cont) } cont->sc_dtx_committable_count = 0; + cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); ds_cont_child_get(cont); dbca->dbca_refs = 0; dbca->dbca_cont = cont; @@ -1939,8 +1970,12 @@ dtx_comp_cb(void **arg) sub->dss_result == dlh->dlh_allow_failure) continue; - /* Ignore DER_INPROGRESS if there is other failure. */ - if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS) + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + + /* Ignore DER_INPROGRESS and DER_AGAIN if there is other failure. */ + if (dlh->dlh_result == 0 || dlh->dlh_result == -DER_INPROGRESS || + dlh->dlh_result == -DER_AGAIN) dlh->dlh_result = sub->dss_result; } } @@ -2206,9 +2241,10 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, while (dtx_cont_opened(cont)) { struct dtx_entry **dtes = NULL; struct dtx_cos_key *dcks = NULL; + struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, oid, - epoch, &dtes, &dcks); + epoch, &dtes, &dcks, &dce); if (cnt <= 0) { rc = cnt; if (rc < 0) @@ -2217,8 +2253,14 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, break; } - rc = dtx_commit(cont, dtes, dcks, cnt); - dtx_free_committable(dtes, dcks, cnt); + if (dce != NULL) { + D_ASSERT(cnt == 1); + + rc = dtx_coll_commit(cont, dce, dcks); + } else { + rc = dtx_commit(cont, dtes, dcks, cnt); + } + dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { D_ERROR("Fail to commit dtx: "DF_RC"\n", DP_RC(rc)); break; @@ -2230,3 +2272,117 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, return rc; } + +void +dtx_merge_check_result(int *tgt, int src) +{ + /* As long as one target has committed, then the DTX is committable on all targets. */ + if (*tgt != DTX_ST_COMMITTED && *tgt != DTX_ST_COMMITTABLE) { + switch (src) { + case DTX_ST_COMMITTED: + case DTX_ST_COMMITTABLE: + *tgt = src; + break; + case -DER_EXCLUDED: + /* + * If non-leader is excluded, handle it as 'prepared'. If other + * non-leaders are also 'prepared' then related DTX maybe still + * committable or 'corrupted'. The subsequent DTX resync logic + * will handle related things, see dtx_verify_groups(). + * + * Fall through. 
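+ *
+ * In the merged result, committed/committable wins over everything
+ * else, while 'prepared' (or excluded) only overrides an empty or
+ * corrupted status; the remaining cases below handle error mixing.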
+ */ + case DTX_ST_PREPARED: + if (*tgt == 0 || *tgt == DTX_ST_CORRUPTED) + *tgt = src; + break; + case DTX_ST_CORRUPTED: + if (*tgt == 0) + *tgt = src; + break; + default: + if (src >= 0) { + if (*tgt != -DER_NONEXIST) + *tgt = -DER_IO; + } else { + if (src == -DER_NONEXIST || *tgt >= 0 || + (*tgt != -DER_IO && *tgt != -DER_NONEXIST)) + *tgt = src; + } + break; + } + } +} + +int +dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, daos_unit_oid_t *oid, + uint32_t version, struct pool_target **p_tgt) +{ + struct pl_map *map = NULL; + struct pl_obj_layout *layout = NULL; + struct dtx_coll_target *dct; + struct daos_obj_md md = { 0 }; + int rc = 0; + int i; + + D_ASSERT(mbs != NULL); + + /* The first UPIN (and join before DTX) target is the (new) leader of the DTX. */ + for (i = 0; i < mbs->dm_tgt_cnt; i++) { + rc = ds_pool_target_status_check(pool, mbs->dm_tgts[i].ddt_id, + (uint8_t)PO_COMP_ST_UPIN, p_tgt); + if (rc < 0) + D_GOTO(out, rc); + + /* The target that (re-)joined the system after DTX cannot be the leader. */ + if (rc == 1 && (*p_tgt)->ta_comp.co_ver <= version) + D_GOTO(out, rc = 0); + } + + if (!(mbs->dm_flags & DMF_COLL_TARGET)) + D_GOTO(out, rc = -DER_NONEXIST); + + map = pl_map_find(pool->sp_uuid, oid->id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(pool->sp_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + dct = (struct dtx_coll_target *)(mbs->dm_tgts + mbs->dm_tgt_cnt); + md.omd_id = oid->id_pub; + md.omd_ver = pool->sp_map_version; + md.omd_fdom_lvl = dct->dct_fdom_lvl; + md.omd_pda = dct->dct_pda; + md.omd_pdom_lvl = dct->dct_pdom_lvl; + + rc = pl_obj_place(map, oid->id_layout_ver, &md, DAOS_OO_RW, NULL, &layout); + if (rc != 0) { + D_ERROR("Failed to load object layout for "DF_OID" in pool "DF_UUID"\n", + DP_OID(oid->id_pub), DP_UUID(pool->sp_uuid)); + goto out; + } + + for (i = 0; i < layout->ol_nr; i++) { + if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) + continue; + + rc = pool_map_find_target(map->pl_poolmap, layout->ol_shards[i].po_target, p_tgt); + D_ASSERT(rc == 1); + + /* The target that (re-)joined the system after DTX cannot be the leader. */ + if ((*p_tgt)->ta_comp.co_ver <= version) + D_GOTO(out, rc = 0); + } + + rc = -DER_NONEXIST; + +out: + if (layout != NULL) + pl_obj_layout_free(layout); + + if (map != NULL) + pl_map_decref(map); + + return rc; +} diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 36d8dee3de9..9442adf2248 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -50,22 +50,24 @@ struct dtx_cos_rec { * related object and dkey (that attached to the dtx_cos_rec). */ struct dtx_cos_rec_child { - /* Link into the container::sc_dtx_cos_list. */ - d_list_t dcrc_gl_committable; + /* Link into the container::sc_dtx_cos_list or container::sc_dtx_coll_list. */ + d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ - d_list_t dcrc_lo_link; - /* The DTX identifier. */ - struct dtx_entry *dcrc_dte; + d_list_t dcrc_lo_link; + union { + struct dtx_entry *dcrc_dte; + struct dtx_coll_entry *dcrc_dce; + }; /* The DTX epoch. */ - daos_epoch_t dcrc_epoch; - /* Pointer to the dtx_cos_rec. */ - struct dtx_cos_rec *dcrc_ptr; + daos_epoch_t dcrc_epoch; + /* For non-collective DTX, it points to the dtx_cos_rec. 
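+ * For collective DTX it is set to NULL, which also discriminates
+ * the union above: dcrc_ptr == NULL means dcrc_dce, rather than
+ * dcrc_dte, is the valid member.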
*/ + struct dtx_cos_rec *dcrc_ptr; }; struct dtx_cos_rec_bundle { - struct dtx_entry *dte; - daos_epoch_t epoch; - uint32_t flags; + void *entry; + daos_epoch_t epoch; + uint32_t flags; }; static int @@ -126,12 +128,18 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } - dcrc->dcrc_dte = dtx_entry_get(rbund->dte); dcrc->dcrc_epoch = rbund->epoch; - dcrc->dcrc_ptr = dcr; - - d_list_add_tail(&dcrc->dcrc_gl_committable, - &cont->sc_dtx_cos_list); + if (rbund->flags & DCF_COLL) { + /* Set dcrc_ptr as NULL to indicate that it is collective DTX. */ + dcrc->dcrc_ptr = NULL; + dcrc->dcrc_dce = dtx_coll_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_coll_list); + cont->sc_dtx_committable_coll_count++; + } else { + dcrc->dcrc_ptr = dcr; + dcrc->dcrc_dte = dtx_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_cos_list); + } cont->sc_dtx_committable_count++; d_tm_inc_gauge(tls->dt_committable, 1); @@ -159,6 +167,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) struct dtx_cos_rec_child *dcrc; struct dtx_cos_rec_child *next; int dec = 0; + int coll = 0; struct dtx_tls *tls = dtx_tls_get(); D_ASSERT(tins->ti_umm.umm_id == UMEM_CLASS_VMEM); @@ -168,7 +177,12 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } @@ -176,7 +190,12 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } @@ -184,13 +203,19 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + coll++; + } D_FREE(dcrc); dec++; } D_FREE(dcr); cont->sc_dtx_committable_count -= dec; + cont->sc_dtx_committable_coll_count -= coll; /** adjust per-pool counter */ d_tm_dec_gauge(tls->dt_committable, dec); @@ -231,12 +256,18 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; - dcrc->dcrc_dte = dtx_entry_get(rbund->dte); dcrc->dcrc_epoch = rbund->epoch; - dcrc->dcrc_ptr = dcr; - - d_list_add_tail(&dcrc->dcrc_gl_committable, - &cont->sc_dtx_cos_list); + if (rbund->flags & DCF_COLL) { + /* Set dcrc_ptr as NULL to indicate that it is collective DTX. 
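+ * Readers such as dtx_fetch_committable() and dtx_del_cos() test
+ * dcrc_ptr against NULL to pick between dcrc_dte and dcrc_dce.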
*/ + dcrc->dcrc_ptr = NULL; + dcrc->dcrc_dce = dtx_coll_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_coll_list); + cont->sc_dtx_committable_coll_count++; + } else { + dcrc->dcrc_ptr = dcr; + dcrc->dcrc_dte = dtx_entry_get(rbund->entry); + d_list_add_tail(&dcrc->dcrc_gl_committable, &cont->sc_dtx_cos_list); + } cont->sc_dtx_committable_count++; d_tm_inc_gauge(tls->dt_committable, 1); @@ -267,7 +298,8 @@ btr_ops_t dtx_btr_cos_ops = { int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, - struct dtx_entry ***dtes, struct dtx_cos_key **dcks) + struct dtx_entry ***dtes, struct dtx_cos_key **dcks, + struct dtx_coll_entry **p_dce) { struct dtx_entry **dte_buf = NULL; struct dtx_cos_key *dck_buf = NULL; @@ -275,6 +307,23 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { + d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { + if (epoch >= dcrc->dcrc_epoch) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + + return 1; + } + } + } + count = min(cont->sc_dtx_committable_count, max_cnt); if (count == 0) { *dtes = NULL; @@ -300,9 +349,21 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (epoch < dcrc->dcrc_epoch) continue; - dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + + if (unlikely(oid != NULL && dcrc->dcrc_ptr == NULL)) { + if (i > 0) + continue; + + D_FREE(dte_buf); + *dcks = dck_buf; + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + + return 1; + } + + dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); if (++i >= count) break; } @@ -373,9 +434,8 @@ dtx_list_cos(struct ds_cont_child *cont, daos_unit_oid_t *oid, } int -dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_unit_oid_t *oid, uint64_t dkey_hash, - daos_epoch_t epoch, uint32_t flags) +dtx_add_cos(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, + uint64_t dkey_hash, daos_epoch_t epoch, uint32_t flags) { struct dtx_cos_key key; struct dtx_cos_rec_bundle rbund; @@ -386,14 +446,13 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, if (!dtx_cont_opened(cont)) return -DER_SHUTDOWN; - D_ASSERT(dte->dte_mbs != NULL); D_ASSERT(epoch != DAOS_EPOCH_MAX); key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); - rbund.dte = dte; + rbund.entry = entry; rbund.epoch = epoch; rbund.flags = flags; d_iov_set(&riov, &rbund, sizeof(rbund)); @@ -401,10 +460,16 @@ dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, rc = dbtree_upsert(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert DTX "DF_DTI" to CoS " - "cache, "DF_UOID", key %lu, flags %x: rc = "DF_RC"\n", - DP_DTI(&dte->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, - flags, DP_RC(rc)); + if (flags & DCF_COLL) + D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + DF_UOID", key %lu, flags %x: "DF_RC"\n", + DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), + (unsigned long)dkey_hash, flags, DP_RC(rc)); + else + D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS 
cache, " + DF_UOID", key %lu, flags %x: "DF_RC"\n", + DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), + (unsigned long)dkey_hash, flags, DP_RC(rc)); return rc; } @@ -413,7 +478,6 @@ int dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) { - struct dtx_tls *tls = dtx_tls_get(); struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; @@ -439,12 +503,16 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_prio_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 1); } @@ -455,12 +523,16 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_reg_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 2); } @@ -471,21 +543,28 @@ dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, d_list_del(&dcrc->dcrc_gl_committable); d_list_del(&dcrc->dcrc_lo_link); - dtx_entry_put(dcrc->dcrc_dte); + if (dcrc->dcrc_ptr != NULL) { + dtx_entry_put(dcrc->dcrc_dte); + } else { + dtx_coll_entry_put(dcrc->dcrc_dce); + cont->sc_dtx_committable_coll_count--; + } D_FREE(dcrc); cont->sc_dtx_committable_count--; dcr->dcr_expcmt_count--; - d_tm_dec_gauge(tls->dt_committable, 1); D_GOTO(out, found = 3); } out: - if (found > 0 && dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, - &kiov, NULL); + if (found > 0) { + d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && + dcr->dcr_expcmt_count == 0) + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } if (rc == 0 && found == 0) rc = -DER_NONEXIST; diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a38c747a61d..134782a8b90 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -22,16 +22,26 @@ * These are for daos_rpc::dr_opc and DAOS_RPC_OPCODE(opc, ...) rather than * crt_req_create(..., opc, ...). See src/include/daos/rpc.h. 
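+ *
+ * The version bump from 3 to 4 below goes with the new DTX_COLL_*
+ * collective RPCs added to the RPC list in this change.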
*/
-#define DAOS_DTX_VERSION 3
+#define DAOS_DTX_VERSION 4
 /* LIST of internal RPCS in form of:
 * OPCODE, flags, FMT, handler, corpc_hdlr,
 */
-#define DTX_PROTO_SRV_RPC_LIST \
- X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_commit") \
- X(DTX_ABORT, 0, &CQF_dtx, dtx_handler, NULL, "dtx_abort") \
- X(DTX_CHECK, 0, &CQF_dtx, dtx_handler, NULL, "dtx_check") \
- X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler, NULL, "dtx_refresh")
+#define DTX_PROTO_SRV_RPC_LIST \
+ X(DTX_COMMIT, 0, &CQF_dtx, dtx_handler, \
+ NULL, "dtx_commit") \
+ X(DTX_ABORT, 0, &CQF_dtx, dtx_handler, \
+ NULL, "dtx_abort") \
+ X(DTX_CHECK, 0, &CQF_dtx, dtx_handler, \
+ NULL, "dtx_check") \
+ X(DTX_REFRESH, 0, &CQF_dtx, dtx_handler, \
+ NULL, "dtx_refresh") \
+ X(DTX_COLL_COMMIT, 0, &CQF_dtx_coll, dtx_coll_handler, \
+ &dtx_coll_commit_co_ops, "dtx_coll_commit") \
+ X(DTX_COLL_ABORT, 0, &CQF_dtx_coll, dtx_coll_handler, \
+ &dtx_coll_abort_co_ops, "dtx_coll_abort") \
+ X(DTX_COLL_CHECK, 0, &CQF_dtx_coll, dtx_coll_handler, \
+ &dtx_coll_check_co_ops, "dtx_coll_check")
 #define X(a, b, c, d, e, f) a,
 enum dtx_operation {
@@ -56,6 +66,27 @@ enum dtx_operation {
 CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX);
+/*
+ * DTX collective RPC input fields.
+ * dci_hints is a sparse array, one entry per engine, sorted by rank ID.
+ * The inline RPC body can carry hints for more than 19K engines.
+ */
+#define DAOS_ISEQ_COLL_DTX \
+ ((uuid_t) (dci_po_uuid) CRT_VAR) \
+ ((uuid_t) (dci_co_uuid) CRT_VAR) \
+ ((struct dtx_id) (dci_xid) CRT_VAR) \
+ ((uint32_t) (dci_version) CRT_VAR) \
+ ((uint32_t) (dci_padding) CRT_VAR) \
+ ((uint64_t) (dci_epoch) CRT_VAR) \
+ ((uint8_t) (dci_hints) CRT_ARRAY)
+
+/* DTX collective RPC output fields */
+#define DAOS_OSEQ_COLL_DTX \
+ ((int32_t) (dco_status) CRT_VAR) \
+ ((uint32_t) (dco_misc) CRT_VAR)
+
+CRT_RPC_DECLARE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX);
+
 #define DTX_YIELD_CYCLE (DTX_THRESHOLD_COUNT >> 3)
 /* The time threshold for triggering DTX cleanup of stale entries. 
@@ -149,6 +180,20 @@ extern uint32_t dtx_batched_ult_max; */ #define DTX_INLINE_MBS_SIZE 512 +#define DTX_COLL_TREE_WIDTH 16 + +extern struct crt_corpc_ops dtx_coll_commit_co_ops; +extern struct crt_corpc_ops dtx_coll_abort_co_ops; +extern struct crt_corpc_ops dtx_coll_check_co_ops; + +struct dtx_coll_prep_args { + struct dtx_coll_entry *dcpa_dce; + crt_rpc_t *dcpa_rpc; + daos_unit_oid_t dcpa_oid; + ABT_future dcpa_future; + int dcpa_result; +}; + struct dtx_pool_metrics { struct d_tm_node_t *dpm_batched_degree; struct d_tm_node_t *dpm_batched_total; @@ -161,7 +206,6 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; - struct d_tm_node_t *dt_dtx_entry_total; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -196,31 +240,37 @@ void dtx_batched_commit(void *arg); void dtx_aggregation_main(void *arg); int start_dtx_reindex_ult(struct ds_cont_child *cont); void stop_dtx_reindex_ult(struct ds_cont_child *cont); +void dtx_merge_check_result(int *tgt, int src); +int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, + daos_unit_oid_t *oid, uint32_t version, struct pool_target **p_tgt); /* dtx_cos.c */ int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, - struct dtx_entry ***dtes, struct dtx_cos_key **dcks); -int dtx_add_cos(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_unit_oid_t *oid, uint64_t dkey_hash, - daos_epoch_t epoch, uint32_t flags); + struct dtx_entry ***dtes, struct dtx_cos_key **dcks, + struct dtx_coll_entry **p_dce); +int dtx_add_cos(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, + uint64_t dkey_hash, daos_epoch_t epoch, uint32_t flags); int dtx_del_cos(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); uint64_t dtx_cos_oldest(struct ds_cont_child *cont); /* dtx_rpc.c */ -int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); - +int dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); int dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *check_list, d_list_t *cmt_list, d_list_t *abt_list, d_list_t *act_list, bool for_io); -int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_epoch_t epoch, int *tgt_array, int *err); +int dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, + uint64_t dkey_hash, daos_epoch_t epoch, int *tgt_array, int *err); -int dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, - struct pool_target **p_tgt); +/* dtx_coll.c */ +void dtx_coll_prep_ult(void *arg); +int dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, + struct dtx_memberships *mbs, uint32_t my_tgtid, uint32_t dtx_ver, + uint32_t pm_ver, bool for_check, bool need_hint, struct dtx_coll_entry **p_dce); +int dtx_coll_local_exec(uuid_t po_uuid, uuid_t co_uuid, struct dtx_id *xid, daos_epoch_t epoch, + uint32_t opc, uint32_t bitmap_sz, uint8_t *bitmap, int **p_results); enum dtx_status_handle_result { DSHR_NEED_COMMIT = 1, @@ -234,4 +284,15 @@ enum dtx_rpc_flags { DRF_INITIAL_LEADER = (1 << 0), }; +enum dtx_cos_flags { + DCF_SHARED = (1 << 0), + /* Some DTX (such as for the distributed transaction across multiple + * RDGs, or for EC object modification) need to be committed via DTX 
+ * RPC instead of piggyback via other dispatched update/punch RPC. + */ + DCF_EXP_CMT = (1 << 1), + /* For collective DTX. */ + DCF_COLL = (1 << 2), +}; + #endif /* __DTX_INTERNAL_H__ */ diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 02f94319c6a..4a7661a5167 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -138,55 +138,22 @@ dtx_resync_commit(struct ds_cont_child *cont, return rc; } -/* Get leader from dtx */ -int -dtx_leader_get(struct ds_pool *pool, struct dtx_memberships *mbs, struct pool_target **p_tgt) -{ - int i; - int rc = 0; - - D_ASSERT(mbs != NULL); - /* The first UPIN target is the leader of the DTX */ - for (i = 0; i < mbs->dm_tgt_cnt; i++) { - rc = ds_pool_target_status_check(pool, mbs->dm_tgts[i].ddt_id, - (uint8_t)PO_COMP_ST_UPIN, p_tgt); - if (rc < 0) - D_GOTO(out, rc); - - if (rc == 1) { - rc = 0; - break; - } - } - - if (i == mbs->dm_tgt_cnt) - rc = -DER_NONEXIST; -out: - return rc; -} - static int dtx_is_leader(struct ds_pool *pool, struct dtx_resync_args *dra, struct dtx_resync_entry *dre) { struct dtx_memberships *mbs = dre->dre_dte.dte_mbs; struct pool_target *target = NULL; - d_rank_t myrank; int rc; if (mbs == NULL) return 1; - rc = dtx_leader_get(pool, mbs, &target); - if (rc < 0) - D_GOTO(out, rc); - - D_ASSERT(target != NULL); - rc = crt_group_rank(NULL, &myrank); + rc = dtx_leader_get(pool, mbs, &dre->dre_oid, dre->dre_dte.dte_ver, &target); if (rc < 0) D_GOTO(out, rc); - if (myrank != target->ta_comp.co_rank || + if (dss_self_rank() != target->ta_comp.co_rank || dss_get_module_info()->dmi_tgt_id != target->ta_comp.co_index) return 0; @@ -261,28 +228,41 @@ dtx_verify_groups(struct ds_pool *pool, struct dtx_memberships *mbs, } int -dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, - daos_epoch_t epoch, int *tgt_array, int *err) +dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_unit_oid_t oid, + uint64_t dkey_hash, daos_epoch_t epoch, int *tgt_array, int *err) { - int rc = 0; + struct dtx_memberships *mbs = dte->dte_mbs; + struct dtx_coll_entry *dce = NULL; + int rc = 0; + + if (mbs->dm_flags & DMF_COLL_TARGET) { + rc = dtx_coll_prep(cont->sc_pool_uuid, oid, &dte->dte_xid, mbs, + dss_get_module_info()->dmi_tgt_id, dte->dte_ver, + cont->sc_pool->spc_map_version, true, true, &dce); + if (rc != 0) { + D_ERROR("Failed to prepare the bitmap (and hints) for collective DTX " + DF_DTI": "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); + goto out; + } - rc = dtx_check(cont, dte, epoch); + rc = dtx_coll_check(cont, dce, epoch); + } else { + rc = dtx_check(cont, dte, epoch); + } switch (rc) { case DTX_ST_COMMITTED: case DTX_ST_COMMITTABLE: /* The DTX has been committed on some remote replica(s), * let's commit the DTX globally. */ - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); case -DER_INPROGRESS: case -DER_TIMEDOUT: D_WARN("Other participants not sure about whether the " "DTX "DF_DTI" is committed or not, need retry.\n", DP_DTI(&dte->dte_xid)); - return DSHR_NEED_RETRY; + D_GOTO(out, rc = DSHR_NEED_RETRY); case DTX_ST_PREPARED: { - struct dtx_memberships *mbs = dte->dte_mbs; - /* If the transaction across multiple redundancy groups, * need to check whether there are enough alive targets. 
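+ * That liveness check is done via dtx_verify_groups(): a positive
+ * return means enough participants survive, so the DTX can still
+ * be committed; otherwise it is marked as corrupted (see below).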
*/ @@ -293,7 +273,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, goto out; if (rc > 0) - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); /* XXX: For the distributed transaction that lose too * many particiants (the whole redundancy group), @@ -304,14 +284,17 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * Then we mark the TX as corrupted via special * dtx_abort() with 0 @epoch. */ - rc = dtx_abort(cont, dte, 0); + if (mbs->dm_flags & DMF_COLL_TARGET) + rc = dtx_coll_abort(cont, dce, 0); + else + rc = dtx_abort(cont, dte, 0); if (rc < 0 && err != NULL) *err = rc; - return DSHR_CORRUPT; + D_GOTO(out, rc = DSHR_CORRUPT); } - return DSHR_NEED_COMMIT; + D_GOTO(out, rc = DSHR_NEED_COMMIT); } case -DER_NONEXIST: /* Someone (the DTX owner or batched commit ULT) may have @@ -345,7 +328,10 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, * some other DTX(s). To avoid complex rollback logic, let's * abort the DTXs one by one, not batched. */ - rc = dtx_abort(cont, dte, epoch); + if (mbs->dm_flags & DMF_COLL_TARGET) + rc = dtx_coll_abort(cont, dce, epoch); + else + rc = dtx_abort(cont, dte, epoch); D_DEBUG(DB_TRACE, "As new leader for DTX "DF_DTI", abort it (2): "DF_RC"\n", DP_DTI(&dte->dte_xid), DP_RC(rc)); @@ -354,10 +340,10 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, if (err != NULL) *err = rc; - return DSHR_ABORT_FAILED; + D_GOTO(out, rc = DSHR_ABORT_FAILED); } - return DSHR_IGNORE; + D_GOTO(out, rc = DSHR_IGNORE); default: D_WARN("Not sure about whether the DTX "DF_DTI " can be committed or not: %d, skip it.\n", @@ -368,6 +354,15 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, } out: + if (rc == DSHR_NEED_COMMIT && mbs->dm_flags & DMF_COLL_TARGET) { + struct dtx_cos_key dck; + + dck.oid = oid; + dck.dkey_hash = dkey_hash; + rc = dtx_coll_commit(cont, dce, &dck); + } + + dtx_coll_entry_put(dce); return rc; } @@ -412,9 +407,10 @@ dtx_status_handle(struct dtx_resync_args *dra) } if (dre->dre_dte.dte_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, &dre->dre_dte.dte_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dre->dre_xid, NULL, + &dre->dre_dte.dte_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST) + if (rc < 0 && rc != -DER_NONEXIST) D_WARN("Failed to load mbs, do not know the leader for DTX " DF_DTI" (ver = %u/%u/%u): rc = %d, skip it.\n", DP_DTI(&dre->dre_xid), dra->resync_version, @@ -446,8 +442,8 @@ dtx_status_handle(struct dtx_resync_args *dra) continue; } - rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_epoch, - tgt_array, &err); + rc = dtx_status_handle_one(cont, &dre->dre_dte, dre->dre_oid, dre->dre_dkey_hash, + dre->dre_epoch, tgt_array, &err); switch (rc) { case DSHR_NEED_COMMIT: goto commit; diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5c4c44c9035..fdcda4abd3f 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -20,6 +20,7 @@ #include "dtx_internal.h" CRT_RPC_DEFINE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX); +CRT_RPC_DEFINE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX); #define X(a, b, c, d, e, f) \ { \ @@ -206,18 +207,16 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: + D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? 
din->di_epoch : 0, rc); + drr->drr_comp = 1; drr->drr_result = rc; rc = ABT_future_set(dra->dra_future, drr); D_ASSERTF(rc == ABT_SUCCESS, "ABT_future_set failed for opc %x to %d/%d: rc = %d.\n", dra->dra_opc, drr->drr_rank, drr->drr_tag, rc); - - D_DEBUG(DB_TRACE, - "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", rc %d.\n", dra->dra_opc, req, - dra->dra_future, drr->drr_rank, drr->drr_tag, - din != NULL ? din->di_epoch : 0, drr->drr_result); } static int @@ -291,41 +290,7 @@ dtx_req_list_cb(void **args) if (dra->dra_opc == DTX_CHECK) { for (i = 0; i < dra->dra_length; i++) { drr = args[i]; - switch (drr->drr_result) { - case DTX_ST_COMMITTED: - case DTX_ST_COMMITTABLE: - dra->dra_result = DTX_ST_COMMITTED; - /* As long as one target has committed the DTX, - * then the DTX is committable on all targets. - */ - D_DEBUG(DB_TRACE, - "The DTX "DF_DTI" has been committed on %d/%d.\n", - DP_DTI(&drr->drr_dti[0]), drr->drr_rank, drr->drr_tag); - return; - case -DER_EXCLUDED: - /* - * If non-leader is excluded, handle it as 'prepared'. If other - * non-leaders are also 'prepared' then related DTX maybe still - * committable or 'corrupted'. The subsequent DTX resync logic - * will handle related things, see dtx_verify_groups(). - * - * Fall through. - */ - case DTX_ST_PREPARED: - if (dra->dra_result == 0 || - dra->dra_result == DTX_ST_CORRUPTED) - dra->dra_result = DTX_ST_PREPARED; - break; - case DTX_ST_CORRUPTED: - if (dra->dra_result == 0) - dra->dra_result = drr->drr_result; - break; - default: - dra->dra_result = drr->drr_result >= 0 ? - -DER_IO : drr->drr_result; - break; - } - + dtx_merge_check_result(&dra->dra_result, drr->drr_result); D_DEBUG(DB_TRACE, "The DTX "DF_DTI" RPC req result %d, status is %d.\n", DP_DTI(&drr->drr_dti[0]), drr->drr_result, dra->dra_result); } @@ -608,7 +573,7 @@ dtx_rpc_internal(struct dtx_common_args *dca) int rc; int i; - if (dca->dca_dra.dra_opc != DTX_REFRESH) { + if (dca->dca_dtes != NULL) { D_ASSERT(dca->dca_dtis != NULL); if (dca->dca_count > 1) { @@ -778,7 +743,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, * Some RPC may has been sent, so need to wait even if dtx_rpc_prep hit failure. */ rc = dtx_rpc_post(&dca, rc, false); - if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED) + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) rc = 0; if (rc != 0) { @@ -833,7 +798,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, DP_DTI(&dtes[0]->dte_xid), count, dra->dra_committed > 0 ? "partial" : "nothing", rc, rc1); else - D_DEBUG(DB_IO, "Commit DTXs " DF_DTI", count %d\n", + D_DEBUG(DB_TRACE, "Commit DTXs " DF_DTI", count %d\n", DP_DTI(&dtes[0]->dte_xid), count); return rc != 0 ? rc : rc1; @@ -870,7 +835,7 @@ dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) if (rc1 > 0 || rc1 == -DER_NONEXIST) rc1 = 0; - D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_IO, "Abort DTX "DF_DTI": rc %d %d %d\n", + D_CDEBUG(rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, "Abort DTX "DF_DTI": rc %d %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1, rc2); return rc1 != 0 ? 
rc1 : rc2; @@ -893,8 +858,8 @@ dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch) rc1 = dtx_rpc_post(&dca, rc, false); - D_CDEBUG(rc1 < 0, DLOG_ERR, DB_IO, "Check DTX "DF_DTI": rc %d %d\n", - DP_DTI(&dte->dte_xid), rc, rc1); + D_CDEBUG(rc1 < 0 && rc1 != -DER_NONEXIST, DLOG_ERR, DB_TRACE, + "Check DTX "DF_DTI": rc %d %d\n", DP_DTI(&dte->dte_xid), rc, rc1); return rc1; } @@ -929,9 +894,9 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che drop = false; if (dsp->dsp_mbs == NULL) { - rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, &dsp->dsp_mbs); + rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) goto out; drop = true; @@ -940,7 +905,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che } again: - rc = dtx_leader_get(pool, dsp->dsp_mbs, &target); + rc = dtx_leader_get(pool, dsp->dsp_mbs, &dsp->dsp_oid, dsp->dsp_version, &target); if (rc < 0) { /** * Currently, for EC object, if parity node is @@ -1166,8 +1131,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che dte.dte_refs = 1; dte.dte_mbs = dsp->dsp_mbs; - rc = dtx_status_handle_one(cont, &dte, dsp->dsp_epoch, - NULL, NULL); + rc = dtx_status_handle_one(cont, &dte, dsp->dsp_oid, dsp->dsp_dkey_hash, + dsp->dsp_epoch, NULL, NULL); switch (rc) { case DSHR_NEED_COMMIT: { struct dtx_entry *pdte = &dte; @@ -1187,6 +1152,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (for_io) D_GOTO(out, rc = -DER_INPROGRESS); continue; + case 0: case DSHR_IGNORE: dtx_dsp_free(dsp); continue; @@ -1297,3 +1263,367 @@ dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont) return rc; } + +static int +dtx_coll_commit_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + out_target->dco_misc += out_source->dco_misc; + if (out_target->dco_status == 0) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_abort_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + if (out_source->dco_status != 0 && + (out_target->dco_status == 0 || out_target->dco_status == -DER_NONEXIST)) + out_target->dco_status = out_source->dco_status; + + return 0; +} + +static int +dtx_coll_check_aggregator(crt_rpc_t *source, crt_rpc_t *target, void *priv) +{ + struct dtx_coll_out *out_source = crt_reply_get(source); + struct dtx_coll_out *out_target = crt_reply_get(target); + + dtx_merge_check_result(&out_target->dco_status, out_source->dco_status); + + return 0; +} + +struct crt_corpc_ops dtx_coll_commit_co_ops = { + .co_aggregate = dtx_coll_commit_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_abort_co_ops = { + .co_aggregate = dtx_coll_abort_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct crt_corpc_ops dtx_coll_check_co_ops = { + .co_aggregate = dtx_coll_check_aggregator, + .co_pre_forward = NULL, + .co_post_reply = NULL, +}; + +struct dtx_coll_rpc_args { + struct ds_cont_child *dcra_cont; + struct dtx_id dcra_xid; + uint32_t dcra_opc; + uint32_t dcra_ver; + daos_epoch_t dcra_epoch; + d_rank_list_t *dcra_ranks; + uint8_t 
*dcra_hints; + uint32_t dcra_hint_sz; + uint32_t dcra_committed; + uint32_t dcra_completed:1; + int dcra_result; + ABT_thread dcra_helper; + ABT_future dcra_future; +}; + +static void +dtx_coll_rpc_cb(const struct crt_cb_info *cb_info) +{ + struct dtx_coll_rpc_args *dcra = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + struct dtx_coll_out *dco; + int rc = cb_info->cci_rc; + + if (rc != 0) { + dcra->dcra_result = rc; + } else { + dco = crt_reply_get(req); + dcra->dcra_result = dco->dco_status; + dcra->dcra_committed = dco->dco_misc; + } + + dcra->dcra_completed = 1; + rc = ABT_future_set(dcra->dcra_future, NULL); + D_ASSERTF(rc == ABT_SUCCESS, + "ABT_future_set failed for opc %u: rc = %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc(struct dtx_coll_rpc_args *dcra) +{ + crt_rpc_t *req = NULL; + struct dtx_coll_in *dci; + int rc; + + rc = ABT_future_create(1, NULL, &dcra->dcra_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed for coll DTX ("DF_DTI") RPC %u: rc = %d\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, rc); + D_GOTO(out, rc = dss_abterr2der(rc)); + } + + rc = crt_corpc_req_create(dss_get_module_info()->dmi_ctx, NULL, dcra->dcra_ranks, + DAOS_RPC_OPCODE(dcra->dcra_opc, DAOS_DTX_MODULE, + DAOS_DTX_VERSION), + NULL, NULL, CRT_RPC_FLAG_FILTER_INVERT, + crt_tree_topo(CRT_TREE_KNOMIAL, DTX_COLL_TREE_WIDTH), &req); + if (rc != 0) { + D_ERROR("crt_corpc_req_create failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + D_GOTO(out, rc); + } + + dci = crt_req_get(req); + + uuid_copy(dci->dci_po_uuid, dcra->dcra_cont->sc_pool_uuid); + uuid_copy(dci->dci_co_uuid, dcra->dcra_cont->sc_uuid); + dci->dci_xid = dcra->dcra_xid; + dci->dci_version = dcra->dcra_ver; + dci->dci_epoch = dcra->dcra_epoch; + dci->dci_hints.ca_count = dcra->dcra_hint_sz; + dci->dci_hints.ca_arrays = dcra->dcra_hints; + + rc = crt_req_send(req, dtx_coll_rpc_cb, dcra); + if (rc != 0) + D_ERROR("crt_req_send failed for coll DTX ("DF_DTI") RPC %u: "DF_RC"\n", + DP_DTI(&dcra->dcra_xid), dcra->dcra_opc, DP_RC(rc)); + +out: + if (rc != 0 && !dcra->dcra_completed) { + dcra->dcra_result = rc; + dcra->dcra_completed = 1; + if (dcra->dcra_future != ABT_FUTURE_NULL) + ABT_future_set(dcra->dcra_future, NULL); + } + + return rc; +} + +static void +dtx_coll_rpc_helper(void *arg) +{ + struct dtx_coll_rpc_args *dcra = arg; + int rc; + + rc = dtx_coll_rpc(dcra); + + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, + "Collective DTX helper ULT for %u exit: %d\n", dcra->dcra_opc, rc); +} + +static int +dtx_coll_rpc_prep(struct ds_cont_child *cont, struct dtx_coll_entry *dce, uint32_t opc, + daos_epoch_t epoch, struct dtx_coll_rpc_args *dcra) +{ + int rc; + + dcra->dcra_cont = cont; + dcra->dcra_xid = dce->dce_xid; + dcra->dcra_opc = opc; + dcra->dcra_ver = dce->dce_ver; + dcra->dcra_epoch = epoch; + dcra->dcra_ranks = dce->dce_ranks; + dcra->dcra_hints = dce->dce_hints; + dcra->dcra_hint_sz = dce->dce_hint_sz; + dcra->dcra_future = ABT_FUTURE_NULL; + dcra->dcra_helper = ABT_THREAD_NULL; + + if (dss_has_enough_helper()) + rc = dss_ult_create(dtx_coll_rpc_helper, dcra, DSS_XS_IOFW, + dss_get_module_info()->dmi_tgt_id, 0, &dcra->dcra_helper); + else + rc = dtx_coll_rpc(dcra); + + return rc; +} + +static int +dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) +{ + int rc; + + if (dcra->dcra_helper != ABT_THREAD_NULL) + ABT_thread_free(&dcra->dcra_helper); + + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc 
!= ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } + + return ret != 0 ? ret : dcra->dcra_result; +} + +int +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + uint32_t committed = 0; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (dce->dce_ranks != NULL) + rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + + if (dce->dce_bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, + DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, + &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] > 0) + committed += results[i]; + else if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (dce->dce_ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + + committed += dcra.dcra_committed; + } + + if (rc == 0 && rc1 == 0) + rc2 = vos_dtx_commit(cont->sc_hdl, &dce->dce_xid, 1, NULL); + else if (committed > 0) + /* Mark the DTX as "PARTIAL_COMMITTED" and re-commit it later via cleanup logic. */ + rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_PARTIAL_COMMITTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + /* + * NOTE: Currently, we commit collective DTX one by one with high priority. So here we have + * to remove the collective DTX entry from the CoS even if the commit failed remotely. + * Otherwise, the batched commit ULT may be blocked by such "bad" entry. + */ + if (rc2 == 0 && dck != NULL) + dtx_del_cos(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively commit DTX "DF_DTI": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; +} + +int +dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch) +{ + struct dtx_coll_rpc_args dcra = { 0 }; + int *results = NULL; + int len; + int rc = 0; + int rc1 = 0; + int rc2 = 0; + int i; + + if (dce->dce_ranks != NULL) + rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + + if (dce->dce_bitmap != NULL) { + len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, + DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, + &results); + if (len < 0) { + rc1 = len; + } else { + D_ASSERT(results != NULL); + for (i = 0; i < len; i++) { + if (results[i] < 0 && results[i] != -DER_NONEXIST && rc1 == 0) + rc1 = results[i]; + } + } + D_FREE(results); + } + + if (dce->dce_ranks != NULL) { + rc = dtx_coll_rpc_post(&dcra, rc); + if (rc > 0 || rc == -DER_NONEXIST || rc == -DER_EXCLUDED || rc == -DER_OOG) + rc = 0; + } + + if (epoch != 0) + rc2 = vos_dtx_abort(cont->sc_hdl, &dce->dce_xid, epoch); + else + rc2 = vos_dtx_set_flags(cont->sc_hdl, &dce->dce_xid, 1, DTE_CORRUPTED); + if (rc2 > 0 || rc2 == -DER_NONEXIST) + rc2 = 0; + + D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, + "Collectively abort DTX "DF_DTI": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), rc, rc1, rc2); + + return rc != 0 ? rc : rc1 != 0 ? 
rc1 : rc2;
+}
+
+int
+dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch)
+{
+ struct dtx_coll_rpc_args dcra = { 0 };
+ int *results = NULL;
+ int len;
+ int rc = 0;
+ int rc1 = 0;
+ int i;
+
+ /*
+ * If there is no other target, then the current target is the unique
+ * participant and 'prepared', so the related DTX can be committed.
+ */
+ if (unlikely(dce->dce_ranks == NULL && dce->dce_bitmap == NULL))
+ return DTX_ST_PREPARED;
+
+ if (dce->dce_ranks != NULL)
+ rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_CHECK, epoch, &dcra);
+
+ if (dce->dce_bitmap != NULL) {
+ len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch,
+ DTX_COLL_CHECK, dce->dce_bitmap_sz, dce->dce_bitmap,
+ &results);
+ if (len < 0) {
+ rc1 = len;
+ } else {
+ D_ASSERT(results != NULL);
+ for (i = 0; i < len; i++) {
+ if (isset(dce->dce_bitmap, i))
+ dtx_merge_check_result(&rc1, results[i]);
+ }
+ }
+ D_FREE(results);
+ }
+
+ if (dce->dce_ranks != NULL) {
+ rc = dtx_coll_rpc_post(&dcra, rc);
+ if (dce->dce_bitmap != NULL)
+ dtx_merge_check_result(&rc, rc1);
+ }
+
+ D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR,
+ DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d\n",
+ DP_DTI(&dce->dce_xid), rc, rc1);
+
+ return dce->dce_ranks != NULL ? rc : rc1;
+}
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 9ea25a9dcd0..b3ed86df76a 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -47,14 +47,6 @@ dtx_tls_init(int tags, int xs_id, int tgt_id)
 D_WARN("Failed to create DTX leader metric: " DF_RC"\n",
 DP_RC(rc));
- rc = d_tm_add_metric(&tls->dt_dtx_entry_total, D_TM_GAUGE,
- "total number of dtx entry in cache", "entry",
- "mem/dtx/dtx_entry_%u/tgt_%u",
- sizeof(struct dtx_entry), tgt_id);
- if (rc != DER_SUCCESS)
- D_WARN("Failed to create DTX entry metric: " DF_RC"\n",
- DP_RC(rc));
-
 return tls;
 }
@@ -247,7 +239,7 @@ dtx_handler(crt_rpc_t *rpc)
 rc1 = start_dtx_reindex_ult(cont);
 if (rc1 != 0)
 D_ERROR(DF_UUID": Failed to trigger DTX reindex: "DF_RC"\n",
- DP_UUID(cont->sc_uuid), DP_RC(rc));
+ DP_UUID(cont->sc_uuid), DP_RC(rc1));
 }
 break;
@@ -341,9 +333,14 @@ dtx_handler(crt_rpc_t *rpc)
 if (mbs[i] == NULL)
 continue;
+ /* For collective DTX, it will be committed soon. */
+ if (mbs[i]->dm_flags & DMF_COLL_TARGET) {
+ D_FREE(mbs[i]);
+ continue;
+ }
+
 daos_dti_copy(&dtes[j].dte_xid,
- (struct dtx_id *)
- din->di_dtx_array.ca_arrays + i);
+ (struct dtx_id *)din->di_dtx_array.ca_arrays + i);
 dtes[j].dte_ver = vers[i];
 dtes[j].dte_refs = 1;
 dtes[j].dte_mbs = mbs[i];
@@ -353,19 +350,19 @@
 j++;
 }
- D_ASSERT(j == rc1);
+ if (j > 0) {
+ /*
+ * Commit the DTX after replying to the original refresh request,
+ * to avoid further queries on the same DTX.
+ */
+ rc = dtx_commit(cont, pdte, dcks, j);
+ if (rc < 0)
+ D_WARN("Failed to commit DTX "DF_DTI", count %d: "
+ DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j, DP_RC(rc));
- /* Commit the DTX after replied the original refresh request to * avoid further query the same DTX. 
- */
- rc = dtx_commit(cont, pdte, dcks, j);
- if (rc < 0)
- D_WARN("Failed to commit DTX "DF_DTI", count %d: "
- DF_RC"\n", DP_DTI(&dtes[0].dte_xid), j,
- DP_RC(rc));
-
- for (i = 0; i < j; i++)
- D_FREE(pdte[i]->dte_mbs);
+ for (i = 0; i < j; i++)
+ D_FREE(pdte[i]->dte_mbs);
+ }
 }
 D_FREE(dout->do_sub_rets.ca_arrays);
@@ -375,13 +372,147 @@
 ds_cont_child_put(cont);
 }
+static void
+dtx_coll_handler(crt_rpc_t *rpc)
+{
+ struct dtx_coll_in *dci = crt_req_get(rpc);
+ struct dtx_coll_out *dco = crt_reply_get(rpc);
+ struct dtx_coll_prep_args dcpa = { 0 };
+ d_rank_t myrank = dss_self_rank();
+ uint32_t bitmap_sz = 0;
+ uint32_t opc = opc_get(rpc->cr_opc);
+ uint8_t *hints = dci->dci_hints.ca_arrays;
+ uint8_t *bitmap = NULL;
+ int *results = NULL;
+ bool force_check = false;
+ int len;
+ int rc;
+ int i;
+
+ D_ASSERT(hints != NULL);
+ D_ASSERT(dci->dci_hints.ca_count > myrank);
+
+ D_DEBUG(DB_TRACE, "Handling collective DTX RPC %u on rank %d for "DF_DTI" with hint %d\n",
+ opc, myrank, DP_DTI(&dci->dci_xid), (int)hints[myrank]);
+
+ dcpa.dcpa_rpc = rpc;
+ rc = ABT_future_create(1, NULL, &dcpa.dcpa_future);
+ if (rc != ABT_SUCCESS) {
+ D_ERROR("ABT_future_create failed: rc = %d\n", rc);
+ D_GOTO(out, rc = dss_abterr2der(rc));
+ }
+
+ rc = dss_ult_create(dtx_coll_prep_ult, &dcpa, DSS_XS_VOS, hints[myrank], 0, NULL);
+ if (rc != 0) {
+ ABT_future_free(&dcpa.dcpa_future);
+ D_ERROR("Failed to create ult on XS %u: "DF_RC"\n", hints[myrank], DP_RC(rc));
+ goto out;
+ }
+
+ rc = ABT_future_wait(dcpa.dcpa_future);
+ D_ASSERT(rc == ABT_SUCCESS);
+
+ ABT_future_free(&dcpa.dcpa_future);
+
+ switch (dcpa.dcpa_result) {
+ case 0:
+ D_ASSERT(dcpa.dcpa_dce != NULL);
+
+ if (unlikely(dcpa.dcpa_dce->dce_bitmap == NULL))
+ /*
+ * For DTX check, if all local shards are either migrated or
+ * not suitable for check, then assume that they are prepared.
+ * For other cases, DTX commit or abort, the bitmap should not
+ * be empty, so there must be some data corruption if empty.
+ */
+ D_GOTO(out, rc = (opc == DTX_COLL_CHECK) ? DTX_ST_PREPARED : -DER_IO);
+
+ bitmap = dcpa.dcpa_dce->dce_bitmap;
+ bitmap_sz = dcpa.dcpa_dce->dce_bitmap_sz;
+ break;
+ case 1:
+ /* The DTX has been committed; what to do next depends on the RPC type. */
+ if (opc == DTX_COLL_ABORT) {
+ D_ERROR("Not allowed to abort committed DTX "DF_DTI"\n",
+ DP_DTI(&dci->dci_xid));
+ D_GOTO(out, rc = -DER_NO_PERM);
+ }
+
+ if (opc == DTX_COLL_CHECK)
+ D_GOTO(out, rc = DTX_ST_COMMITTED);
+
+ D_ASSERT(opc == DTX_COLL_COMMIT);
+ /*
+ * We do not know whether the DTX on the other VOS targets has been committed
+ * or not, let's continue the commit on the other local VOS targets by force.
+ */
+ break;
+ case -DER_INPROGRESS:
+ /* Fall through. */
+ case -DER_NONEXIST:
+ /* The shard on the hint VOS target may not exist; what to do next depends on the RPC type. */
+ if (opc == DTX_COLL_CHECK)
+ force_check = true;
+ /*
+ * It is unknown whether the DTX on the other VOS targets has been committed/aborted
+ * or not, let's continue the related operation on the other local VOS targets by force. 
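+ * Here "by force" presumably means proceeding with a NULL bitmap,
+ * so that dtx_coll_local_exec() covers every local VOS target
+ * instead of only the hinted ones (note the bitmap == NULL checks
+ * in the loops below).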
+		 */
+		break;
+	default:
+		D_ASSERTF(dcpa.dcpa_result < 0, "Unexpected result when loading MBS for DTX "
+			  DF_DTI": "DF_RC"\n", DP_DTI(&dci->dci_xid), DP_RC(dcpa.dcpa_result));
+		D_GOTO(out, rc = dcpa.dcpa_result);
+	}
+
+	len = dtx_coll_local_exec(dci->dci_po_uuid, dci->dci_co_uuid, &dci->dci_xid, dci->dci_epoch,
+				  opc, bitmap_sz, bitmap, &results);
+	if (len < 0)
+		D_GOTO(out, rc = len);
+
+	if (opc == DTX_COLL_CHECK) {
+		for (i = 0; i < len; i++) {
+			if (bitmap == NULL || isset(bitmap, i))
+				dtx_merge_check_result(&rc, results[i]);
+		}
+
+		/*
+		 * For the force check case, if no shard has been committed, we cannot trust a
+		 * -DER_NONEXIST result; instead, return -DER_INPROGRESS to make the leader retry.
+		 */
+		if (force_check && rc == -DER_NONEXIST)
+			D_GOTO(out, rc = -DER_INPROGRESS);
+	} else {
+		for (i = 0; i < len; i++) {
+			if (bitmap == NULL || isset(bitmap, i)) {
+				if (results[i] >= 0)
+					dco->dco_misc += results[i];
+				else if (results[i] != -DER_NONEXIST && rc == 0)
+					rc = results[i];
+			}
+		}
+	}
+
+out:
+	D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE,
+		 "Handled collective DTX RPC %u on rank %u for "DF_DTI": "DF_RC"\n",
+		 opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc));
+
+	dco->dco_status = rc;
+	rc = crt_reply_send(rpc);
+	if (rc < 0)
+		D_ERROR("Failed to send collective RPC %p reply: "DF_RC"\n", rpc, DP_RC(rc));
+
+	dtx_coll_entry_put(dcpa.dcpa_dce);
+	D_FREE(results);
+}
+
 static int
 dtx_init(void)
 {
 	int	rc;
 
 	dtx_agg_thd_cnt_up = DTX_AGG_THD_CNT_DEF;
-	d_getenv_int("DAOS_DTX_AGG_THD_CNT", &dtx_agg_thd_cnt_up);
+	d_getenv_uint32_t("DAOS_DTX_AGG_THD_CNT", &dtx_agg_thd_cnt_up);
 	if (dtx_agg_thd_cnt_up < DTX_AGG_THD_CNT_MIN || dtx_agg_thd_cnt_up > DTX_AGG_THD_CNT_MAX) {
 		D_WARN("Invalid DTX aggregation count threshold %u, the valid range is [%u, %u], "
 		       "use the default value %u\n", dtx_agg_thd_cnt_up, DTX_AGG_THD_CNT_MIN,
@@ -393,7 +524,7 @@ dtx_init(void)
 	D_INFO("Set DTX aggregation count threshold as %u (entries)\n", dtx_agg_thd_cnt_up);
 
 	dtx_agg_thd_age_up = DTX_AGG_THD_AGE_DEF;
-	d_getenv_int("DAOS_DTX_AGG_THD_AGE", &dtx_agg_thd_age_up);
+	d_getenv_uint32_t("DAOS_DTX_AGG_THD_AGE", &dtx_agg_thd_age_up);
 	if (dtx_agg_thd_age_up < DTX_AGG_THD_AGE_MIN || dtx_agg_thd_age_up > DTX_AGG_THD_AGE_MAX) {
 		D_WARN("Invalid DTX aggregation age threshold %u, the valid range is [%u, %u], "
 		       "use the default value %u\n", dtx_agg_thd_age_up, DTX_AGG_THD_AGE_MIN,
@@ -405,7 +536,7 @@ dtx_init(void)
 	D_INFO("Set DTX aggregation time threshold as %u (seconds)\n", dtx_agg_thd_age_up);
 
 	dtx_batched_ult_max = DTX_BATCHED_ULT_DEF;
-	d_getenv_int("DAOS_DTX_BATCHED_ULT_MAX", &dtx_batched_ult_max);
+	d_getenv_uint32_t("DAOS_DTX_BATCHED_ULT_MAX", &dtx_batched_ult_max);
 	D_INFO("Set the max count of DTX batched commit ULTs as %d\n", dtx_batched_ult_max);
 
 	rc = dbtree_class_register(DBTREE_CLASS_DTX_CF,
diff --git a/src/engine/init.c b/src/engine/init.c
index eb3bca9edb1..23379878700 100644
--- a/src/engine/init.c
+++ b/src/engine/init.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -454,7 +454,7 @@ set_abt_max_num_xstreams(int n) if (value == NULL) return -DER_NOMEM; D_INFO("Setting %s to %s\n", name, value); - rc = setenv(name, value, 1 /* overwrite */); + rc = d_setenv(name, value, 1 /* overwrite */); D_FREE(value); if (rc != 0) return daos_errno2der(errno); diff --git a/src/engine/srv.c b/src/engine/srv.c index aa6cbd706e8..df0733ed638 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1037,7 +1037,7 @@ dss_xstreams_init(void) D_INFO("ULT mmap()'ed stack allocation is disabled.\n"); #endif - d_getenv_int("DAOS_SCHED_RELAX_INTVL", &sched_relax_intvl); + d_getenv_uint("DAOS_SCHED_RELAX_INTVL", &sched_relax_intvl); if (sched_relax_intvl == 0 || sched_relax_intvl > SCHED_RELAX_INTVL_MAX) { D_WARN("Invalid relax interval %u, set to default %u msecs.\n", @@ -1059,7 +1059,7 @@ dss_xstreams_init(void) D_INFO("CPU relax mode is set to [%s]\n", sched_relax_mode2str(sched_relax_mode)); - d_getenv_int("DAOS_SCHED_UNIT_RUNTIME_MAX", &sched_unit_runtime_max); + d_getenv_uint("DAOS_SCHED_UNIT_RUNTIME_MAX", &sched_unit_runtime_max); d_getenv_bool("DAOS_SCHED_WATCHDOG_ALL", &sched_watchdog_all); /* start the execution streams */ diff --git a/src/engine/ult.c b/src/engine/ult.c index 204381755fb..a6e4257107e 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -97,6 +97,9 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, int xs_nr; int rc; int tid; + int tgt_id = dss_get_module_info()->dmi_tgt_id; + uint32_t bm_len; + bool self = false; if (ops == NULL || args == NULL || ops->co_func == NULL) { D_DEBUG(DB_MD, "mandatory args missing dss_collective_reduce"); @@ -115,6 +118,7 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, return -DER_CANCELED; } + bm_len = args->ca_tgt_bitmap_sz << 3; xs_nr = dss_tgt_nr; stream_args = &args->ca_stream_args; D_ALLOC_ARRAY(stream_args->csa_streams, xs_nr); @@ -156,19 +160,18 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, stream = &stream_args->csa_streams[tid]; stream->st_coll_args = &carg; - if (args->ca_exclude_tgts_cnt) { - int i; - - for (i = 0; i < args->ca_exclude_tgts_cnt; i++) - if (args->ca_exclude_tgts[i] == tid) - break; - - if (i < args->ca_exclude_tgts_cnt) { + if (args->ca_tgt_bitmap != NULL) { + if (tid >= bm_len || isclr(args->ca_tgt_bitmap, tid)) { D_DEBUG(DB_TRACE, "Skip tgt %d\n", tid); rc = ABT_future_set(future, (void *)stream); D_ASSERTF(rc == ABT_SUCCESS, "%d\n", rc); continue; } + + if (tgt_id == tid && flags & DSS_USE_CURRENT_ULT) { + self = true; + continue; + } } dx = dss_get_xstream(DSS_MAIN_XS_ID(tid)); @@ -209,6 +212,12 @@ dss_collective_reduce_internal(struct dss_coll_ops *ops, } } + if (self) { + stream = &stream_args->csa_streams[tgt_id]; + stream->st_coll_args = &carg; + collective_func(stream); + } + ABT_future_wait(future); rc = aggregator.at_rc; @@ -322,6 +331,45 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags) return dss_collective_internal(func, arg, true, flags); } +int +dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz) +{ + uint8_t *bitmap = NULL; + uint32_t size = ((dss_tgt_nr - 1) >> 3) + 1; + uint32_t bits = size << 3; + int rc = 0; + int i; + + D_ALLOC(bitmap, size); + if (bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < size; i++) + bitmap[i] = 
0xff; + + for (i = dss_tgt_nr; i < bits; i++) + clrbit(bitmap, i); + + if (exclude_tgts == NULL) + goto out; + + for (i = 0; i < exclude_cnt; i++) { + D_ASSERT(exclude_tgts[i] < dss_tgt_nr); + clrbit(bitmap, exclude_tgts[i]); + } + +out: + if (rc == 0) { + *p_bitmap = bitmap; + *bitmap_sz = size; + } else { + D_ERROR("Failed to build bitmap for collective task: "DF_RC"\n", DP_RC(rc)); + } + + return rc; +} + /* ============== ULT create functions =================================== */ static inline int diff --git a/src/gurt/dlog.c b/src/gurt/dlog.c index 6b34a5035ab..7cbce2fa7b6 100644 --- a/src/gurt/dlog.c +++ b/src/gurt/dlog.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -841,26 +841,29 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, mst.flush_pri = DLOG_WARN; mst.log_id_cb = log_id_cb; - env = getenv(D_LOG_FLUSH_ENV); + d_agetenv_str(&env, D_LOG_FLUSH_ENV); if (env) { pri = d_log_str2pri(env, strlen(env) + 1); if (pri != -1) mst.flush_pri = pri; + d_free_env_str(&env); } - env = getenv(D_LOG_TRUNCATE_ENV); + d_agetenv_str(&env, D_LOG_TRUNCATE_ENV); if (env != NULL && atoi(env) > 0) truncate = 1; + d_free_env_str(&env); - env = getenv(D_LOG_SIZE_ENV); + d_agetenv_str(&env, D_LOG_SIZE_ENV); if (env != NULL) { log_size = d_getenv_size(env); if (log_size < LOG_SIZE_MIN) log_size = LOG_SIZE_MIN; + d_free_env_str(&env); } - env = getenv(D_LOG_FILE_APPEND_PID_ENV); + d_agetenv_str(&env, D_LOG_FILE_APPEND_PID_ENV); if (logfile != NULL && env != NULL) { if (strcmp(env, "0") != 0) { rc = asprintf(&buffer, "%s.%d", logfile, getpid()); @@ -872,10 +875,12 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, "continuing.\n"); } } + d_free_env_str(&env); - env = getenv(D_LOG_FILE_APPEND_RANK_ENV); + d_agetenv_str(&env, D_LOG_FILE_APPEND_RANK_ENV); if (env && strcmp(env, "0") != 0) mst.append_rank = true; + d_free_env_str(&env); /* quick sanity check (mst.tag is non-null if already open) */ if (d_log_xst.tag || !tag || @@ -910,9 +915,10 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, int log_flags = O_RDWR | O_CREAT; struct stat st; - env = getenv(D_LOG_STDERR_IN_LOG_ENV); + d_agetenv_str(&env, D_LOG_STDERR_IN_LOG_ENV); if (env != NULL && atoi(env) > 0) merge_stderr = true; + d_free_env_str(&env); if (!truncate) log_flags |= O_APPEND; @@ -1074,24 +1080,35 @@ bool d_logfac_is_enabled(const char *fac_name) { char *ddsubsys_env; char *ddsubsys_fac; - int len = strlen(fac_name); + int len = strlen(fac_name); + bool rc; /* read env DD_SUBSYS to enable corresponding facilities */ - ddsubsys_env = getenv(DD_FAC_ENV); + d_agetenv_str(&ddsubsys_env, DD_FAC_ENV); if (ddsubsys_env == NULL) return true; /* enable all facilities by default */ - if (strncasecmp(ddsubsys_env, DD_FAC_ALL, strlen(DD_FAC_ALL)) == 0) - return true; /* enable all facilities with DD_SUBSYS=all */ + if (strncasecmp(ddsubsys_env, DD_FAC_ALL, strlen(DD_FAC_ALL)) == 0) { + rc = true; /* enable all facilities with DD_SUBSYS=all */ + goto out; + } ddsubsys_fac = strcasestr(ddsubsys_env, fac_name); - if (ddsubsys_fac == NULL) - return false; + if (ddsubsys_fac == NULL) { + rc = false; + goto out; + } - if (ddsubsys_fac[len] != '\0' && ddsubsys_fac[len] != ',') - return false; + if (ddsubsys_fac[len] != '\0' && ddsubsys_fac[len] != ',') { + rc = false; + goto out; + } + + rc = true; - return true; +out: + d_free_env_str(&ddsubsys_env); + return 
rc;
 }
 
 /*
diff --git a/src/gurt/examples/telem_consumer_example.c b/src/gurt/examples/telem_consumer_example.c
index 6b7b1653a16..53cc0311d7f 100644
--- a/src/gurt/examples/telem_consumer_example.c
+++ b/src/gurt/examples/telem_consumer_example.c
@@ -7,8 +7,8 @@
 * This file shows an example of using the telemetry API to consume metrics
 */
-#include "gurt/telemetry_common.h"
-#include "gurt/telemetry_consumer.h"
+#include <gurt/telemetry_common.h>
+#include <gurt/telemetry_consumer.h>
 
 /**
 * An example that shows how metrics are read.
diff --git a/src/gurt/examples/telem_producer_example.c b/src/gurt/examples/telem_producer_example.c
index 13c324d144b..45e7c524f3d 100644
--- a/src/gurt/examples/telem_producer_example.c
+++ b/src/gurt/examples/telem_producer_example.c
@@ -7,8 +7,8 @@
 * This file shows an example of using the telemetry API to produce metrics
 */
-#include "gurt/telemetry_common.h"
-#include "gurt/telemetry_producer.h"
+#include <gurt/telemetry_common.h>
+#include <gurt/telemetry_producer.h>
 
 /**
 * A sample function that creates and increments a metric for a loop counter
diff --git a/src/gurt/fault_inject.c b/src/gurt/fault_inject.c
index b102d3664b1..4ffbd2c40e4 100644
--- a/src/gurt/fault_inject.c
+++ b/src/gurt/fault_inject.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2018-2023 Intel Corporation.
+ * (C) Copyright 2018-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -534,7 +534,7 @@ d_fault_inject_init(void)
 	}
 	D_RWLOCK_UNLOCK(&d_fi_gdata.dfg_rwlock);
 
-	config_file = getenv(D_FAULT_CONFIG_ENV);
+	d_agetenv_str(&config_file, D_FAULT_CONFIG_ENV);
 	if (config_file == NULL || strlen(config_file) == 0) {
 		D_INFO("No config file, fault injection is OFF.\n");
 		D_GOTO(out, rc);
@@ -616,6 +616,7 @@ d_fault_inject_init(void)
 out:
 	if (fp)
 		fclose(fp);
+	d_free_env_str(&config_file);
 	return rc;
 }
diff --git a/src/gurt/misc.c b/src/gurt/misc.c
index 46e019e5ee8..de0a1ae6fd7 100644
--- a/src/gurt/misc.c
+++ b/src/gurt/misc.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -17,12 +17,16 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 
+#define UINT64_MAX_STR "18446744073709551615"
+
 /* state buffer for DAOS rand and srand calls, NOT thread safe */
 static struct drand48_data randBuffer = {0};
@@ -947,149 +951,487 @@ d_rank_range_list_free(d_rank_range_list_t *range_list)
 }
 
 static inline bool
-dis_integer_str(char *str)
+dis_unsigned_str(char *str)
 {
-	char	*p;
+	char	*eos;
 
-	p = str;
-	if (p == NULL || strlen(p) == 0)
+	if (str == NULL || str[0] == '\0')
 		return false;
 
-	while (*p != '\0') {
-		if (*p <= '9' && *p >= '0') {
-			p++;
-			continue;
-		} else {
-			return false;
-		}
-	}
+	eos = str + (sizeof(UINT64_MAX_STR) - 1);
+	while (str != eos && *str != '\0' && *str >= '0' && *str <= '9')
+		++str;
 
-	return true;
+	return *str == '\0';
 }
 
 static inline bool
dis_single_char_str(char *str)
 {
-	return strlen(str) == 1;
+	return strnlen(str, 2) == 1;
+}
+
+/**
+ * Overloads to hook the unsafe getenv()/[un]setenv()/putenv()/clearenv()
+ * functions from glibc.
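+ * All of the d_*env() helpers below serialize on a single process-wide
+ * rwlock (d_env_lock), and d_agetenv_str() returns a private copy of the
+ * value, so callers never hold pointers into the live environment.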
+ */
+static pthread_rwlock_t d_env_lock = PTHREAD_RWLOCK_INITIALIZER;
+
+static inline void
+d_env_rwlock_rdlock()
+{
+	int rc;
+
+	rc = pthread_rwlock_rdlock(&d_env_lock);
+	if (rc != 0)
+		fprintf(stderr, "d_env_rwlock_rdlock(%p) rc=%d %s\n", &d_env_lock, rc,
+			strerror(rc));
+	assert(rc == 0);
+}
+
+static inline void
+d_env_rwlock_wrlock()
+{
+	int rc;
+
+	rc = pthread_rwlock_wrlock(&d_env_lock);
+	if (rc != 0)
+		fprintf(stderr, "d_env_rwlock_wrlock(%p) rc=%d %s\n", &d_env_lock, rc,
+			strerror(rc));
+	assert(rc == 0);
+}
+
+static inline void
+d_env_rwlock_unlock()
+{
+	int rc;
+
+	rc = pthread_rwlock_unlock(&d_env_lock);
+	if (rc != 0)
+		fprintf(stderr, "d_env_rwlock_unlock(%p) rc=%d %s\n", &d_env_lock, rc,
+			strerror(rc));
+	assert(rc == 0);
+}
+
+/**
+ * Check if an environment variable is defined.
+ *
+ * \param[in]	name	name of the environment variable.
+ * \return		true iff the environment variable is defined.
+ */
+bool
+d_isenv_def(char *name)
+{
+	char *env;
+
+	d_env_rwlock_rdlock();
+	env = getenv(name);
+	d_env_rwlock_unlock();
+
+	return env != NULL;
+}
+
+/**
+ * Get a string type environment variable
+ *
+ * \param[in,out]	str_val	returned value of the ENV. Will not change the original
+ *				value if ENV is not set.
+ * \param[in]		str_size	Size of the input string.
+ * \param[in]		name	name of the environment variable.
+ * \return			0 on success, a negative value on error.
+ */
+int
+d_getenv_str(char *str_val, size_t str_size, const char *name)
+{
+	char	*tmp;
+	int	 len;
+	int	 rc = -DER_SUCCESS;
+
+	assert(name != NULL);
+	assert(str_val != NULL);
+	assert(str_size > 0);
+
+	d_env_rwlock_rdlock();
+
+	tmp = getenv(name);
+	if (tmp == NULL) {
+		rc = -DER_NONEXIST;
+		goto out;
+	}
+
+	len = strnlen(tmp, str_size);
+	memcpy(str_val, tmp, len);
+
+	if (len == str_size) {
+		fprintf(stderr, "ENV '%s' has been truncated\n", name);
+		rc = -DER_TRUNC;
+		--len;
+	}
+	str_val[len] = '\0';
+
+out:
+	d_env_rwlock_unlock();
+
+	return rc;
+}
+
+/**
+ * Get a string type environment variable
+ *
+ * \param[in,out]	str_val	returned value of the ENV on success, NULL on error.
+ * \param[in]		name	name of the environment variable.
+ * \return			0 on success, a negative value on error.
+ */
+int
+d_agetenv_str(char **str_val, const char *name)
+{
+	char	*env;
+	char	*tmp;
+	int	 rc;
+
+	assert(name != NULL);
+
+	*str_val = NULL;
+
+	d_env_rwlock_rdlock();
+
+	env = getenv(name);
+	if (env == NULL) {
+		rc = -DER_NONEXIST;
+		goto out;
+	}
+
+	/* DAOS-14532 There is no limit to environment variable size */
+	tmp = strdup(env);
+	if (tmp == NULL) {
+		rc = -DER_NOMEM;
+		goto out;
+	}
+
+	*str_val = tmp;
+	rc = -DER_SUCCESS;
+
+out:
+	d_env_rwlock_unlock();
+
+	return rc;
+}
+
+/**
+ * Frees memory space of an environment string value.
+ *
+ * \param[in,out]	str_val	Copy of an environment string value.
+ */
+void
+d_free_env_str(char **str_val)
+{
+	assert(str_val != NULL);
+
+	if (*str_val == NULL)
+		return;
+
+	free(*str_val);
+	*str_val = NULL;
 }
 
 /**
- * get a bool type environment variables
+ * Get a bool type environment variable.
 *
- * \param[in] env name of the environment variable
+ * \param[in]		name	name of the environment variable.
 * \param[in,out]	bool_val	returned value of the ENV. Will not change the original
 *				value if ENV is not set. Set as false if the env is set to
 *				0, otherwise set as true.
+ * \return			0 on success, a negative value on error.
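+ *
+ * A hedged usage sketch (the env name "DAOS_MY_FLAG" is illustrative only,
+ * not part of this patch):
+ *
+ *	bool flag = false;
+ *
+ *	d_getenv_bool("DAOS_MY_FLAG", &flag);
+ *	(flag keeps its default when the variable is unset; any valid
+ *	non-integer string sets it to true)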
*/ -void -d_getenv_bool(const char *env, bool *bool_val) +int +d_getenv_bool(const char *name, bool *bool_val) { - char *env_val; + char *env; + char *endptr; + long long val; + int rc; - if (env == NULL) - return; - D_ASSERT(bool_val != NULL); + assert(name != NULL); + assert(bool_val != NULL); - env_val = getenv(env); - if (!env_val) - return; + d_env_rwlock_rdlock(); + + env = getenv(name); + if (env == NULL) { + rc = -DER_NONEXIST; + goto out; + } /* treats any valid non-integer string as true */ - if (!dis_integer_str(env_val)) - *bool_val = true; + errno = 0; + val = strtoll(env, &endptr, 10); + *bool_val = errno != 0 || endptr == env || *endptr != '\0' || val != 0; + rc = -DER_SUCCESS; - *bool_val = (atoi(env_val) == 0 ? false : true); +out: + d_env_rwlock_unlock(); + + return rc; } /** - * get single character environment variable + * get single character environment variable. * - * \param[in] env name of the environment variable - * \param[in,out] char_val returned value of the ENV. Will not change the original value + * \param[in] name name of the environment variable. + * \param[in,out] char_val returned value of the ENV. Will not change the original + * value. + * \return 0 on success, a negative value on error. */ -void -d_getenv_char(const char *env, char *char_val) +int +d_getenv_char(const char *name, char *char_val) { - char *env_val; + char *env; + int rc; - if (env == NULL || char_val == NULL) - return; + assert(name != NULL); + assert(char_val != NULL); - env_val = getenv(env); - if (!env_val) - return; + d_env_rwlock_rdlock(); - if (!dis_single_char_str(env_val)) { - D_ERROR("ENV %s is not single character.\n", env_val); - return; + env = getenv(name); + if (env == NULL) { + rc = -DER_NONEXIST; + goto out; } - *char_val = *env_val; + + if (!dis_single_char_str(env)) { + rc = -DER_INVAL; + goto out; + } + + *char_val = *env; + rc = -DER_SUCCESS; + +out: + d_env_rwlock_unlock(); + + return rc; +} + +static int +d_getenv_ull(unsigned long long *val, const char *name) +{ + char *env; + char *endptr; + unsigned long long tmp; + int rc; + + assert(val != NULL); + assert(name != NULL); + + d_env_rwlock_rdlock(); + env = getenv(name); + if (env == NULL) { + rc = -DER_NONEXIST; + goto out; + } + + if (!dis_unsigned_str(env)) { + rc = -DER_INVAL; + goto out; + } + + errno = 0; + tmp = strtoull(env, &endptr, 0); + if (errno != 0 || endptr == env || *endptr != '\0') { + rc = -DER_INVAL; + goto out; + } + + *val = tmp; + rc = -DER_SUCCESS; + +out: + d_env_rwlock_unlock(); + + return rc; } /** - * get an integer type environment variables + * get an unsigned integer type environment variables. * - * \param[in] env name of the environment variable - * \param[in,out] int_val returned value of the ENV. Will not change the original value if ENV - * is not set or set as a - * non-integer value. + * \param[in] name name of the environment variable. + * \param[in,out] uint_val returned value of the ENV. Will not change the original + * value if ENV is not set or set as a non-integer value. + * \return 0 on success, a negative value on error. 
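+ *
+ * A hedged usage sketch mirroring the engine callers in this patch (the
+ * default of 100 is illustrative):
+ *
+ *	unsigned intvl = 100;
+ *
+ *	d_getenv_uint("DAOS_SCHED_RELAX_INTVL", &intvl);
+ *	(intvl is left untouched on -DER_NONEXIST or -DER_INVAL)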
*/ -void -d_getenv_int(const char *env, unsigned *int_val) +int +d_getenv_uint(const char *name, unsigned *uint_val) { - char *env_val; - unsigned value; + int rc; + unsigned long long tmp; - if (env == NULL || int_val == NULL) - return; + assert(uint_val != NULL); + assert(name != NULL); - env_val = getenv(env); - if (!env_val) - return; + rc = d_getenv_ull(&tmp, name); + if (rc != -DER_SUCCESS) + return rc; - if (!dis_integer_str(env_val)) { - D_ERROR("ENV %s is not integer.\n", env_val); - return; +#if UINT_MAX != ULLONG_MAX + assert(sizeof(unsigned) < sizeof(unsigned long long)); + if (tmp > UINT_MAX) { + return -DER_INVAL; } +#endif - value = atoi(env_val); - D_DEBUG(DB_TRACE, "get ENV %s as %d.\n", env, value); - *int_val = value; + *uint_val = (unsigned)tmp; + return -DER_SUCCESS; } +/** + * get a 32bits unsigned integer type environment variables + * + * \param[in] name name of the environment variable. + * \param[in,out] uint32_val returned value of the ENV. Will not change the original + * value if ENV is not set or set as a non-integer value. + * \return 0 on success, a negative value on error. + */ int -d_getenv_uint64_t(const char *env, uint64_t *val) -{ - char *env_val; - size_t env_len; - int matched; - uint64_t new_val; - int count; - - env_val = getenv(env); - if (!env_val) { - D_DEBUG(DB_TRACE, "ENV '%s' unchanged at %"PRId64"\n", env, *val); - return -DER_NONEXIST; - } +d_getenv_uint32_t(const char *name, uint32_t *uint32_val) +{ + int rc; + unsigned long long tmp; + + assert(uint32_val != NULL); + assert(name != NULL); - env_len = strnlen(env_val, 128); - if (env_len == 128) { - D_ERROR("ENV '%s' is invalid\n", env); + rc = d_getenv_ull(&tmp, name); + if (rc != -DER_SUCCESS) + return rc; + +#if UINT32_MAX != ULLONG_MAX + assert(sizeof(uint32_t) < sizeof(unsigned long long)); + if (tmp > UINT32_MAX) { return -DER_INVAL; } +#endif + + *uint32_val = (uint32_t)tmp; + return -DER_SUCCESS; +} + +/** + * get a 64bits unsigned integer type environment variables + * + * \param[in] name name of the environment variable. + * \param[in,out] uint64_val returned value of the ENV. Will not change the original + * value if ENV is not set or set as a non-integer value. + * \return 0 on success, a negative value on error. + */ +int +d_getenv_uint64_t(const char *name, uint64_t *uint64_val) +{ + int rc; + unsigned long long tmp; + + assert(uint64_val != NULL); + assert(name != NULL); - /* Now do scanf, check that the number was matched, and there are no extra unmatched - * characters at the end. - */ - matched = sscanf(env_val, "%"PRId64"%n", &new_val, &count); - if (matched == 1 && env_len == count) { - *val = new_val; - D_DEBUG(DB_TRACE, "ENV '%s' set to %"PRId64"\n", env, *val); - return -DER_SUCCESS; + rc = d_getenv_ull(&tmp, name); + if (rc != -DER_SUCCESS) + return rc; + +#if UINT64_MAX != ULLONG_MAX + assert(sizeof(uint64_t) < sizeof(unsigned long long)); + if (tmp > UINT64_MAX) { + return -DER_INVAL; } +#endif - D_ERROR("ENV '%s' is invalid: '%s'\n", env, env_val); - return -DER_INVAL; + *uint64_val = (uint64_t)tmp; + return -DER_SUCCESS; +} + +/** + * Thread safe wrapper of the libc putenv() function. + * + * \param[in] name name of the environment variable. + * \return 0 on success, a negative value on error. + */ +int +d_putenv(char *name) +{ + int env_errno; + int rc; + + d_env_rwlock_wrlock(); + errno = 0; + rc = putenv(name); + env_errno = errno; + d_env_rwlock_unlock(); + + errno = env_errno; + return rc; +} + +/** + * Thread safe wrapper of the libc setenv() function. 
+ *
+ * \param[in]	name		name of the environment variable.
+ * \param[in]	value		value of the environment variable.
+ * \param[in]	overwrite	overwrite when nonzero.
+ * \return			0 on success, a negative value on error.
+ */
+int
+d_setenv(const char *name, const char *value, int overwrite)
+{
+	int env_errno;
+	int rc;
+
+	d_env_rwlock_wrlock();
+	errno = 0;
+	rc    = setenv(name, value, overwrite);
+	env_errno = errno;
+	d_env_rwlock_unlock();
+
+	errno = env_errno;
+	return rc;
+}
+
+/**
+ * Thread safe wrapper of the libc unsetenv() function.
+ *
+ * \param[in]	name	name of the environment variable.
+ * \return		0 on success, a negative value on error.
+ */
+int
+d_unsetenv(const char *name)
+{
+	int env_errno;
+	int rc;
+
+	d_env_rwlock_wrlock();
+	errno = 0;
+	rc    = unsetenv(name);
+	env_errno = errno;
+	d_env_rwlock_unlock();
+
+	errno = env_errno;
+	return rc;
+}
+
+/**
+ * Thread safe wrapper of the libc clearenv() function.
+ *
+ * \return	0 on success, a negative value on error.
+ */
+int
+d_clearenv(void)
+{
+	int rc;
+
+	d_env_rwlock_wrlock();
+	rc = clearenv();
+	d_env_rwlock_unlock();
+
+	return rc;
}
 
 /**
@@ -1339,119 +1681,3 @@ d_vec_pointers_append(struct d_vec_pointers *pointers, void *pointer)
 	pointers->p_len++;
 	return 0;
 }
-
-/**
- * Overloads to hook the unsafe getenv()/[un]setenv()/putenv()/clearenv()
- * functions from glibc.
- * Libgurt is the preferred place for this as it is the lowest layer in DAOS,
- * so it will be the earliest to be loaded and will ensure the hook to be
- * installed as early as possible and could prevent usage of LD_PRELOAD.
- * The idea is to strengthen all the environment APIs by using a common lock.
- *
- * XXX this will address the main lack of multi-thread protection in the Glibc
- * APIs but do not handle all unsafe use-cases (like the change/removal of an
- * env var when its value address has already been grabbed by a previous
- * getenv(), ...).
- */
-
-static pthread_rwlock_t hook_env_lock = PTHREAD_RWLOCK_INITIALIZER;
-static char *(* ATOMIC real_getenv)(const char *);
-static int (* ATOMIC real_putenv)(char *);
-static int (* ATOMIC real_setenv)(const char *, const char *, int);
-static int (* ATOMIC real_unsetenv)(const char *);
-static int (* ATOMIC real_clearenv)(void);
-
-static void bind_libc_symbol(void **real_ptr_addr, const char *name)
-{
-	void *real_temp;
-
-	/* XXX __atomic_*() built-ins are used to avoid the need to cast
-	 * each of the ATOMIC pointers of functions, that seems to be
-	 * required to make Intel compiler happy ...
-	 */
-	if (__atomic_load_n(real_ptr_addr, __ATOMIC_RELAXED) == NULL) {
-		/* libc should be already loaded ...
*/
-		real_temp = dlsym(RTLD_NEXT, name);
-		if (real_temp == NULL) {
-			/* try after loading libc now */
-			void *handle;
-
-			handle = dlopen("libc.so.6", RTLD_LAZY);
-			D_ASSERT(handle != NULL);
-			real_temp = dlsym(handle, name);
-			D_ASSERT(real_temp != NULL);
-		}
-		__atomic_store_n(real_ptr_addr, real_temp, __ATOMIC_RELAXED);
-	}
-}
-
-static pthread_once_t init_real_symbols_flag = PTHREAD_ONCE_INIT;
-
-static void init_real_symbols(void)
-{
-	bind_libc_symbol((void **)&real_getenv, "getenv");
-	bind_libc_symbol((void **)&real_putenv, "putenv");
-	bind_libc_symbol((void **)&real_setenv, "setenv");
-	bind_libc_symbol((void **)&real_unsetenv, "unsetenv");
-	bind_libc_symbol((void **)&real_clearenv, "clearenv");
-}
-
-char *getenv(const char *name)
-{
-	char *p;
-
-	pthread_once(&init_real_symbols_flag, init_real_symbols);
-	D_RWLOCK_RDLOCK(&hook_env_lock);
-	p = real_getenv(name);
-	D_RWLOCK_UNLOCK(&hook_env_lock);
-
-	return p;
-}
-
-int putenv(char *name)
-{
-	int rc;
-
-	pthread_once(&init_real_symbols_flag, init_real_symbols);
-	D_RWLOCK_WRLOCK(&hook_env_lock);
-	rc = real_putenv(name);
-	D_RWLOCK_UNLOCK(&hook_env_lock);
-
-	return rc;
-}
-
-int setenv(const char *name, const char *value, int overwrite)
-{
-	int rc;
-
-	pthread_once(&init_real_symbols_flag, init_real_symbols);
-	D_RWLOCK_WRLOCK(&hook_env_lock);
-	rc = real_setenv(name, value, overwrite);
-	D_RWLOCK_UNLOCK(&hook_env_lock);
-
-	return rc;
-}
-
-int unsetenv(const char *name)
-{
-	int rc;
-
-	pthread_once(&init_real_symbols_flag, init_real_symbols);
-	D_RWLOCK_WRLOCK(&hook_env_lock);
-	rc = real_unsetenv(name);
-	D_RWLOCK_UNLOCK(&hook_env_lock);
-
-	return rc;
-}
-
-int clearenv(void)
-{
-	int rc;
-
-	pthread_once(&init_real_symbols_flag, init_real_symbols);
-	D_RWLOCK_WRLOCK(&hook_env_lock);
-	rc = real_clearenv();
-	D_RWLOCK_UNLOCK(&hook_env_lock);
-
-	return rc;
-}
diff --git a/src/gurt/slab.c b/src/gurt/slab.c
index 337daa31eb7..10b1a041990 100644
--- a/src/gurt/slab.c
+++ b/src/gurt/slab.c
@@ -9,7 +9,7 @@
 #include
 #include
-#include "gurt/slab.h"
+#include <gurt/slab.h>
 
 static void
debug_dump(struct d_slab_type *type)
diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c
index 3294c066226..bdd963207bb 100644
--- a/src/gurt/telemetry.c
+++ b/src/gurt/telemetry.c
@@ -16,9 +16,9 @@
 #include
 #include
 #include
-#include "gurt/telemetry_common.h"
-#include "gurt/telemetry_producer.h"
-#include "gurt/telemetry_consumer.h"
+#include <gurt/telemetry_common.h>
+#include <gurt/telemetry_producer.h>
+#include <gurt/telemetry_consumer.h>
 
 /** minimal list of shared memory regions with a global ID */
 struct shmem_region_list {
diff --git a/src/gurt/tests/SConscript b/src/gurt/tests/SConscript
index e5ca3dc6951..a773b12812a 100644
--- a/src/gurt/tests/SConscript
+++ b/src/gurt/tests/SConscript
@@ -20,13 +20,18 @@ def scons():
     test_env.require('mercury', 'uuid')
     test_env.AppendUnique(LIBS=['pthread', 'cmocka', 'm', 'dl'])
     test_env.AppendUnique(CXXFLAGS=['-std=c++0x'])
-    tests = []
+    mocks_ld_script = f"{Dir('.').srcnode()}/mocks-gurt-ld-opts"
+    test_env.AppendUnique(LINKFLAGS=[f'-Wl,@{mocks_ld_script}'])
+    mocks = test_env.SharedObject(['mocks_gurt.c'])
+    Export('mocks')
+
+    tests = []
     for test in TEST_SRC:
         testobj = test_env.Object(test)
         testname = os.path.splitext(test)[0]
         testprog = test_env.d_test_program(target=testname,
-                                           source=testobj + gurt_targets,
+                                           source=testobj + gurt_targets + [mocks],
                                           LIBS=test_env["LIBS"] + ['yaml'])
         tests.append(testprog)
diff --git a/src/gurt/tests/mocks-gurt-ld-opts b/src/gurt/tests/mocks-gurt-ld-opts
new file mode 100644
index 00000000000..9abe80aa90f
--- /dev/null
+++
b/src/gurt/tests/mocks-gurt-ld-opts @@ -0,0 +1,2 @@ +--wrap=getenv +--wrap=strdup diff --git a/src/gurt/tests/mocks_gurt.c b/src/gurt/tests/mocks_gurt.c new file mode 100644 index 00000000000..db74590cac9 --- /dev/null +++ b/src/gurt/tests/mocks_gurt.c @@ -0,0 +1,62 @@ +/* + * (C) Copyright 2018-2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include + +#include "mocks_gurt.h" + +static bool mock_getenv = false; +char *getenv_return; /* value to be returned */ + +void +mock_getenv_setup(void) +{ + mock_getenv = true; +} + +void +mock_getenv_teardown(void) +{ + mock_getenv = false; +} + +char * +__real_getenv(const char *name); + +char * +__wrap_getenv(const char *name) +{ + if (mock_getenv) + return getenv_return; + + return __real_getenv(name); +} + +static bool mock_strdup = false; + +void +mock_strdup_setup(void) +{ + mock_strdup = true; +} + +void +mock_strdup_teardown(void) +{ + mock_strdup = false; +} + +char * +__real_strdup(const char *name); + +char * +__wrap_strdup(const char *name) +{ + if (mock_strdup) + return NULL; + + return __real_strdup(name); +} diff --git a/src/gurt/tests/mocks_gurt.h b/src/gurt/tests/mocks_gurt.h new file mode 100644 index 00000000000..7ac7c4bc157 --- /dev/null +++ b/src/gurt/tests/mocks_gurt.h @@ -0,0 +1,24 @@ +/* + * (C) Copyright 2018-2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DAOS_MOCKS_GURT_H__ +#define __DAOS_MOCKS_GURT_H__ + +#include + +extern char *getenv_return; /* value to be returned */ + +void +mock_getenv_setup(void); +void +mock_getenv_teardown(void); + +void +mock_strdup_setup(void); +void +mock_strdup_teardown(void); + +#endif /* __DAOS_MOCKS_GURT_H__ */ diff --git a/src/gurt/tests/test_gurt.c b/src/gurt/tests/test_gurt.c index 2bfddd37012..49db0a883e6 100644 --- a/src/gurt/tests/test_gurt.c +++ b/src/gurt/tests/test_gurt.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,6 +21,7 @@ #include #include #include +#include "mocks_gurt.h" /* machine epsilon */ #define EPSILON (1.0E-16) @@ -120,11 +121,11 @@ test_d_errstr(void **state) /* Check the boundary at the end of the GURT error numbers, this will need updating if * additional error numbers are added. */ - value = d_errstr(-DER_HG_FATAL); - assert_string_equal(value, "DER_HG_FATAL"); - value = d_errstr(-1045); - assert_string_equal(value, "DER_HG_FATAL"); - value = d_errstr(-(DER_HG_FATAL + 1)); + value = d_errstr(-DER_QUOTA_LIMIT); + assert_string_equal(value, "DER_QUOTA_LIMIT"); + value = d_errstr(-1046); + assert_string_equal(value, "DER_QUOTA_LIMIT"); + value = d_errstr(-(DER_QUOTA_LIMIT + 1)); assert_string_equal(value, "DER_UNKNOWN"); /* Check the end of the DAOS error numbers. 
*/ @@ -2076,6 +2077,372 @@ test_d_rank_list_dup_sort_uniq(void **state) } } +static int +setup_getenv_mocks(void **state) +{ + mock_getenv_setup(); + return 0; +} + +static int +teardown_getenv_mocks(void **state) +{ + mock_getenv_teardown(); + return 0; +} + +static void +test_d_getenv_str(void **state) +{ + char env[] = "012"; + int rc = 0; + + getenv_return = "bar"; + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_SUCCESS); + assert_string_equal(env, "bar"); + + getenv_return = ""; + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_SUCCESS); + assert_string_equal(env, ""); + + getenv_return = "too long string"; + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_TRUNC); + assert_string_equal(env, "too"); + + getenv_return = "too long string"; + rc = d_getenv_str(env, 2, "foo"); + assert_int_equal(rc, -DER_TRUNC); + assert_string_equal(env, "t"); + + assert_string_equal(env, "t"); + getenv_return = "too long string"; + rc = d_getenv_str(env, 1, "foo"); + assert_int_equal(rc, -DER_TRUNC); + assert_string_equal(env, ""); + + getenv_return = NULL; + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_NONEXIST); + assert_string_equal(env, ""); +} + +static void +test_d_agetenv_str(void **state) +{ + char *env = NULL; + int rc = 0; + + getenv_return = "bar"; + rc = d_agetenv_str(&env, "foo"); + assert_int_equal(rc, -DER_SUCCESS); + assert_non_null(env); + assert_string_equal(env, "bar"); + d_free_env_str(&env); + assert_null(env); + + getenv_return = ""; + rc = d_agetenv_str(&env, "foo"); + assert_int_equal(rc, -DER_SUCCESS); + assert_non_null(env); + assert_string_equal(env, ""); + d_free_env_str(&env); + assert_null(env); + + getenv_return = NULL; + env = (char *)0x1; + rc = d_agetenv_str(&env, "foo"); + assert_int_equal(rc, -DER_NONEXIST); + assert_null(env); + + getenv_return = "bar"; + env = (char *)0x1; + mock_strdup_setup(); + rc = d_agetenv_str(&env, "foo"); + mock_strdup_teardown(); + assert_int_equal(rc, -DER_NOMEM); + assert_null(env); +} + +static void +test_d_getenv_bool(void **state) +{ + bool val = false; + int rc = 0; + + getenv_return = "bar"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "true"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "false"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "1"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "999999999999999999999999999999999999"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "0dmlnv"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "1"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = false; + getenv_return = "03"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = true; + getenv_return = "0"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_false(val); + + val = true; + getenv_return = " 0"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_false(val); + + val = false; + 
getenv_return = "0 "; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val); + + val = true; + getenv_return = "0000000"; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_false(val); + + val = true; + getenv_return = NULL; + rc = d_getenv_bool("foo", &val); + assert_int_equal(rc, -DER_NONEXIST); + assert_true(val); +} + +static void +test_d_getenv_char(void **state) +{ + char val = '\0'; + int rc = 0; + + getenv_return = "a"; + rc = d_getenv_char("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == 'a'); + + getenv_return = ""; + rc = d_getenv_char("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 'a'); + + getenv_return = "booo"; + rc = d_getenv_char("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 'a'); + + getenv_return = NULL; + rc = d_getenv_char("foo", &val); + assert_int_equal(rc, -DER_NONEXIST); + assert_true(val == 'a'); +} + +static void +test_d_getenv_uint(void **state) +{ + unsigned val = 0; + int rc = 0; + + getenv_return = "4294967295"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT_MAX); + + getenv_return = "42"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == 42); + + getenv_return = "4294967296"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "-42"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "booo"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "42booo"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = NULL; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_NONEXIST); + assert_true(val == 42); +} + +static void +test_d_getenv_uint32_t(void **state) +{ + uint32_t val = 0; + int rc = 0; + + getenv_return = "4294967295"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT32_MAX); + + getenv_return = "42"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == 42); + + getenv_return = "4294967296"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "-42"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "booo"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "42booo"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = ""; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = NULL; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_NONEXIST); + assert_true(val == 42); +} + +static void +test_d_getenv_uint64_t(void **state) +{ + uint64_t val = 0; + int rc = 0; + + getenv_return = "18446744073709551615"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT64_MAX); + + getenv_return = "42"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == 42); + + getenv_return = "18446744073709551616"; + rc = 
d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "012345678901234567890"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "-42"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "booo"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = "42booo"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = ""; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == 42); + + getenv_return = NULL; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_NONEXIST); + assert_true(val == 42); +} + +static void +test_d_setenv(void **state) +{ + char env[1]; + int rc; + + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_NONEXIST); + assert_false(d_isenv_def("foo")); + + rc = d_setenv("foo", "bar", 0); + assert_int_equal(rc, -DER_SUCCESS); + + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_TRUNC); + assert_true(d_isenv_def("foo")); + + rc = d_unsetenv("foo"); + assert_int_equal(rc, -DER_SUCCESS); + + rc = d_getenv_str(env, sizeof(env), "foo"); + assert_int_equal(rc, -DER_NONEXIST); + assert_false(d_isenv_def("foo")); +} + int main(int argc, char **argv) { @@ -2098,10 +2465,23 @@ main(int argc, char **argv) cmocka_unit_test(test_gurt_string_buffer), cmocka_unit_test(test_d_rank_list_dup_sort_uniq), cmocka_unit_test(test_hash_perf), - }; + cmocka_unit_test_setup_teardown(test_d_getenv_str, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_agetenv_str, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_getenv_bool, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_getenv_char, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_getenv_uint, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_getenv_uint32_t, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test_setup_teardown(test_d_getenv_uint64_t, setup_getenv_mocks, + teardown_getenv_mocks), + cmocka_unit_test(test_d_setenv)}; d_register_alt_assert(mock_assert); - return cmocka_run_group_tests_name("test_gurt", tests, init_tests, - fini_tests); + return cmocka_run_group_tests_name("test_gurt", tests, init_tests, fini_tests); } diff --git a/src/gurt/tests/test_gurt_telem_producer.c b/src/gurt/tests/test_gurt_telem_producer.c index bf3db9d19c9..0a1731c607d 100644 --- a/src/gurt/tests/test_gurt_telem_producer.c +++ b/src/gurt/tests/test_gurt_telem_producer.c @@ -15,9 +15,9 @@ #include #include #include "wrap_cmocka.h" -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_producer.h" -#include "gurt/telemetry_consumer.h" +#include +#include +#include #define STATS_EPSILON (0.00001) #define TEST_IDX (99) diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 3aa87fccdaa..e76fb3433e3 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -2247,6 +2247,30 @@ crt_quiet_error(int err) return err == -DER_GRPVER; } +/** + * Change the quota limit. 
+ *
+ * \param[in]	crt_ctx	CaRT context
+ * \param[in]	quota	Quota type
+ * \param[in]	value	Value
+ *
+ * \return		DER_SUCCESS on success, negative value on
+ *			failure.
+ */
+int crt_context_quota_limit_set(crt_context_t crt_ctx, crt_quota_type_t quota, int value);
+
+/**
+ * Query the quota limit.
+ *
+ * \param[in]	crt_ctx	CaRT context
+ * \param[in]	quota	Quota type
+ * \param[out]	value	Returned value
+ *
+ * \return		DER_SUCCESS on success, negative value on
+ *			failure.
+ */
+int crt_context_quota_limit_get(crt_context_t crt_ctx, crt_quota_type_t quota, int *value);
+
 /**
 * Get the proto version of an RPC request.
 *
diff --git a/src/include/cart/types.h b/src/include/cart/types.h
index 0ce7dd79815..d02e2881047 100644
--- a/src/include/cart/types.h
+++ b/src/include/cart/types.h
@@ -440,6 +440,17 @@ typedef enum {
 	CRT_GROUP_MOD_OP_COUNT,
 } crt_group_mod_op_t;
 
+/**
+ * Quotas supported by CaRT.
+ */
+typedef enum {
+	/** Limit on the number of inflight RPCs */
+	CRT_QUOTA_RPCS,
+
+	/** Total count of supported quotas */
+	CRT_QUOTA_COUNT,
+} crt_quota_type_t;
+
 /** @} */
 
 #endif /* __CRT_TYPES_H__ */
diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h
index 14b2337ea0f..765fa15e137 100644
--- a/src/include/daos/dtx.h
+++ b/src/include/daos/dtx.h
@@ -62,6 +62,8 @@ enum dtx_mbs_flags {
 	 * shard index to sort the dtx_memberships::dm_tgts. Obsolete.
 	 */
 	DMF_SORTED_SAD_IDX	= (1 << 3),
+	/* The DTX target information is organized as dtx_coll_target. */
+	DMF_COLL_TARGET		= (1 << 4),
 };
 
 /**
@@ -128,6 +130,64 @@ struct dtx_redundancy_group {
 	uint32_t	drg_ids[0];
 };
 
+/*
+ * How many targets are recorded in dtx_memberships::dm_tgts for a collective DTX. The first one
+ * is the current leader; the others are new leader candidates, in order, for when the leader is
+ * switched.
+ *
+ * In most cases, when a DTX leader switch happens, DTX resync will commit or abort the related
+ * DTX. After that, the related dtx_memberships becomes useless and is discarded. So unless the
+ * new leader is dead and excluded during the current DTX resync, one new leader candidate would
+ * be enough. We record three new leader candidates, which can resolve the leader election
+ * trouble twice if the leader switches during DTX resync.
+ */
+#define DTX_COLL_INLINE_TARGETS	4
+
+/**
+ * A collective transaction may contain a lot of participants. If we store all of them one by one
+ * in the dtx_memberships (MBS) structure, then the MBS body will be very large. Transferring such
+ * a large MBS on the network is inconvenient and may have to go via RDMA instead of being packed
+ * directly inside the related RPC body.
+ *
+ * To avoid such a bad situation, collective DTX uses dtx_coll_target. Instead of recording all
+ * the DTX participants' information in the MBS, dtx_coll_target records the targets residing on
+ * the current engine, which can be used for local DTX operations (commit, abort, check).
+ *
+ * Please note that collective DTX can only be used for single-object based standalone operations.
+ * If the current user is the collective DTX leader and wants to operate the collective DTX on
+ * other DAOS engines, then it needs to re-calculate the related participants based on the related
+ * object layout. For most commit/abort cases, the collective DTX leader has already prepared the
+ * participants' information in DRAM before starting the DTX, so it is unnecessary to re-calculate
+ * the participants. Re-calculating the DTX participants happens when resyncing or cleaning up the
+ * collective DTX.
Such two cases are relatively rare, so even if the overhead for such re-calculation is quite
+ * high, it will not affect the whole system too much.
+ *
+ * On the other hand, DTX refresh is frequently used DTX logic, and efficiently finding the DTX
+ * leader is crucial for it. Considering DTX leader switches, we record several new leader
+ * candidates in the MBS in front of the collective targets' information. Then in most cases, DTX
+ * refresh does not need to re-calculate the DTX participants.
+ */
+struct dtx_coll_target {
+	/* Fault domain level - used for generating related object layout. */
+	uint32_t	dct_fdom_lvl;
+	/* Performance domain affinity - used for generating related object layout. */
+	uint32_t	dct_pda;
+	/* Performance domain level - used for generating related object layout. */
+	uint32_t	dct_pdom_lvl;
+	/* The object layout version - used for generating related object layout. */
+	uint16_t	dct_layout_ver;
+	/* How many shards on the current engine participate in the collective DTX. */
+	uint8_t		dct_tgt_nr;
+	/* The size of dct_bitmap. */
+	uint8_t		dct_bitmap_sz;
+	/*
+	 * The ID (pool_component::co_id) array for targets on the current engine, used for DTX
+	 * check. The bitmap for local object shards on the current engine is appended after the
+	 * ID array. The bitmap is used for DTX commit and abort. In fact, we can re-calculate
+	 * such a bitmap based on the targets' ID, but directly storing the bitmap is more
+	 * efficient since it is not big.
+	 */
+	uint32_t	dct_tgts[0];
+};
+
 struct dtx_memberships {
 	/* How many touched shards in the DTX. */
 	uint32_t			dm_tgt_cnt;
@@ -153,7 +213,8 @@ struct dtx_memberships {
 	};
 
 	/* The first 'sizeof(struct dtx_daos_target) * dm_tgt_cnt' is the
-	 * dtx_daos_target array. The subsequent are modification groups.
+	 * dtx_daos_target array. The subsequent can be redundancy groups
+	 * or dtx_coll_target, depending on dm_flags.
 	 */
 	union {
 		char				dm_data[0];
diff --git a/src/include/daos/object.h b/src/include/daos/object.h
index 71d37facac0..60e7df575f8 100644
--- a/src/include/daos/object.h
+++ b/src/include/daos/object.h
@@ -206,6 +206,84 @@ struct daos_shard_tgt {
 	uint8_t			st_flags;	/* see daos_tgt_flags */
 };
 
+struct daos_coll_shard {
+	uint16_t	dcs_nr;
+	uint16_t	dcs_cap;
+	uint32_t	dcs_inline;
+	/* The shards (ID) in the buffer locate on the same VOS target. */
+	uint32_t	*dcs_buf;
+
+	/*
+	 * Index (in layout) of the first shard corresponding to "dcs_buf[0]" on this target,
+	 * do not pack on-wire.
+	 */
+	uint32_t	dcs_idx;
+};
+
+struct daos_coll_target {
+	uint32_t	dct_rank;
+	/*
+	 * The size (in byte) of dct_bitmap. Its bit count (size << 3) may be smaller than
+	 * dss_tgt_nr if only some VOS targets are involved. It also may be larger than
+	 * dss_tgt_nr if dss_tgt_nr is not 2 ^ n aligned.
+	 */
+	uint8_t		dct_bitmap_sz;
+	/* The max shard in dct_shards; it may be smaller than the sparse array length. */
+	uint8_t		dct_max_shard;
+	/*
+	 * How many valid object shards reside on the engine. If the real count exceeds what a
+	 * uint8_t can hold, just set it to the max; that does not matter.
+	 */
+	uint8_t		dct_tgt_nr;
+	/*
+	 * The capacity for the dct_tgt_ids array.
+	 * For the non-modification case, it is always zero to avoid sending dct_tgt_ids on wire.
+	 */
+	uint8_t		dct_tgt_cap;
+	/* Bitmap for the VOS targets (on the rank) that are involved in the operation. */
+	uint8_t		*dct_bitmap;
+	/* Sparse array for object shards' identifiers, sorted by VOS target index.
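+	 * For example, dct_shards[i].dcs_buf would list the shard IDs living on VOS target i
+	 * (an illustrative reading; targets cleared in dct_bitmap simply have no entries).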
*/
+	struct daos_coll_shard	*dct_shards;
+	/*
+	 * It stores the identifiers of shards on the engine, regardless of which VOS target
+	 * they reside on; used only for the modification case.
+	 */
+	uint32_t	*dct_tgt_ids;
+};
+
+static inline void
+daos_coll_shard_cleanup(struct daos_coll_shard *shards, uint32_t count)
+{
+	struct daos_coll_shard	*shard;
+	int			 i;
+
+	if (shards != NULL) {
+		for (i = 0; i < count; i++) {
+			shard = &shards[i];
+			if (shard->dcs_buf != &shard->dcs_inline)
+				D_FREE(shard->dcs_buf);
+		}
+		D_FREE(shards);
+	}
+}
+
+static inline void
+daos_coll_target_cleanup(struct daos_coll_target *dcts, uint32_t count)
+{
+	struct daos_coll_target	*dct;
+	int			 i;
+
+	if (dcts != NULL) {
+		for (i = 0; i < count; i++) {
+			dct = &dcts[i];
+			daos_coll_shard_cleanup(dct->dct_shards, dct->dct_max_shard + 1);
+			D_FREE(dct->dct_bitmap);
+			D_FREE(dct->dct_tgt_ids);
+		}
+		D_FREE(dcts);
+	}
+}
+
 static inline bool
daos_oid_is_null(daos_obj_id_t oid)
 {
diff --git a/src/include/daos/placement.h b/src/include/daos/placement.h
index d48be639f8c..72c3d4254d9 100644
--- a/src/include/daos/placement.h
+++ b/src/include/daos/placement.h
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2016-2022 Intel Corporation.
+ * (C) Copyright 2016-2023 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -63,7 +63,9 @@ struct pl_obj_shard {
 	uint32_t	po_shard;	/* shard identifier */
 	uint32_t	po_target;	/* target id */
 	uint32_t	po_fseq;	/* The latest failure sequence */
-	uint32_t	po_rebuilding:1,	/* rebuilding status */
+	uint16_t	po_rank;	/* The rank on which the shard exists */
+	uint8_t		po_index;	/* The target index inside the node */
+	uint8_t		po_rebuilding:1,	/* rebuilding status */
 			po_reintegrating:1;	/* reintegrating status */
 };
diff --git a/src/include/daos_errno.h b/src/include/daos_errno.h
index 2eca16d371e..f3a3cdddcdf 100644
--- a/src/include/daos_errno.h
+++ b/src/include/daos_errno.h
@@ -117,7 +117,9 @@ extern "C" {
 	/** Invalid user/group permissions.*/ \
 	ACTION(DER_SHMEM_PERMS, Unable to access shared memory segment due to incompatible user or group permissions) \
 	/** Fatal (non-retry-able) transport layer mercury error */ \
-	ACTION(DER_HG_FATAL, Fatal transport layer mercury error)
+	ACTION(DER_HG_FATAL, Fatal transport layer mercury error) \
+	/** Quota limit reached on the requested resource */ \
+	ACTION(DER_QUOTA_LIMIT, Quota limit reached)
 
 /** TODO: add more error numbers */
 
 /** Preprocessor macro defining DAOS errno values and internal definition of d_errstr */
diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h
index 7664dbe0ca6..daa9184e707 100644
--- a/src/include/daos_srv/container.h
+++ b/src/include/daos_srv/container.h
@@ -63,6 +63,7 @@ struct ds_cont_child {
 	ABT_mutex		 sc_mutex;
 	ABT_cond		 sc_dtx_resync_cond;
 	ABT_cond		 sc_scrub_cond;
+	ABT_cond		 sc_rebuild_cond;
 	uint32_t		 sc_dtx_resyncing:1,
 				 sc_dtx_reindex:1,
 				 sc_dtx_reindex_abort:1,
@@ -71,7 +72,8 @@ struct ds_cont_child {
 				 sc_stopping:1,
 				 sc_vos_agg_active:1,
 				 sc_ec_agg_active:1,
-				 sc_scrubbing:1;
+				 sc_scrubbing:1,
+				 sc_rebuilding:1;
 	uint32_t		 sc_dtx_batched_gen;
 	/* Tracks the schedule request for aggregation ULT */
 	struct sched_request	*sc_agg_req;
@@ -97,7 +99,8 @@ struct ds_cont_child {
 	uint32_t		 sc_snapshots_nr;
 	uint32_t		 sc_open;
 
-	uint64_t		 sc_dtx_committable_count;
+	uint32_t		 sc_dtx_committable_count;
+	uint32_t		 sc_dtx_committable_coll_count;
 
 	/* The global minimum EC aggregation epoch, which will be upper
	 * limit for VOS aggregation, i.e.
EC object VOS aggregation can @@ -121,8 +124,10 @@ struct ds_cont_child { daos_handle_t sc_dtx_cos_hdl; /* The DTX COS-btree. */ struct btr_root sc_dtx_cos_btr; - /* The global list for committable DTXs. */ + /* The global list for committable non-collective DTXs. */ d_list_t sc_dtx_cos_list; + /* The global list for committable collective DTXs. */ + d_list_t sc_dtx_coll_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; /* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */ @@ -185,7 +190,8 @@ void ds_cont_child_stop_all(struct ds_pool_child *pool_child); int ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid, struct ds_cont_child **ds_cont); - +void +ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child); /** initialize a csummer based on container properties. Will retrieve the * checksum related properties from IV */ diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index be491483fbc..1b715f91b18 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -512,6 +512,8 @@ enum dss_ult_flags { DSS_ULT_FL_PERIODIC = (1 << 0), /* Use DSS_DEEP_STACK_SZ as the stack size */ DSS_ULT_DEEP_STACK = (1 << 1), + /* Use current ULT (instead of creating new one) for the task. */ + DSS_USE_CURRENT_ULT = (1 << 2), }; int dss_ult_create(void (*func)(void *), void *arg, int xs_type, int tgt_id, @@ -581,8 +583,14 @@ struct dss_coll_args { /** Arguments for dss_collective func (Mandatory) */ void *ca_func_args; void *ca_aggregator; - int *ca_exclude_tgts; - unsigned int ca_exclude_tgts_cnt; + /* Specify on which targets to execute the task. */ + uint8_t *ca_tgt_bitmap; + /* + * The size (in byte) of ca_tgt_bitmap. It may be smaller than dss_tgt_nr if only some + * VOS targets are involved. It also may be larger than dss_tgt_nr if dss_tgt_nr is not + * 2 ^ n aligned. + */ + uint32_t ca_tgt_bitmap_sz; /** Stream arguments for all streams */ struct dss_coll_stream_args ca_stream_args; }; @@ -604,6 +612,8 @@ dss_thread_collective_reduce(struct dss_coll_ops *ops, unsigned int flags); int dss_task_collective(int (*func)(void *), void *arg, unsigned int flags); int dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags); +int dss_build_coll_bitmap(int *exclude_tgts, uint32_t exclude_cnt, uint8_t **p_bitmap, + uint32_t *bitmap_sz); /** * Loaded module management metholds diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index ee78ffe3ec9..f1ee94ff2e4 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -23,6 +23,7 @@ struct dtx_share_peer { daos_epoch_t dsp_epoch; uint64_t dsp_dkey_hash; int dsp_status; + uint32_t dsp_version; uint32_t dsp_inline_mbs:1; struct dtx_memberships *dsp_mbs; }; @@ -64,7 +65,6 @@ struct dtx_handle { dth_pinned:1, /* DTXs in CoS list are committed. */ dth_cos_done:1, - dth_resent:1, /* For resent case. */ /* Only one participator in the DTX. */ dth_solo:1, /* Do not keep committed entry. 
*/ @@ -141,9 +141,21 @@ struct dtx_handle { struct dtx_sub_status { struct daos_shard_tgt dss_tgt; int dss_result; + uint32_t dss_version; uint32_t dss_comp:1; }; +struct dtx_coll_entry { + struct dtx_id dce_xid; + uint32_t dce_ver; + uint32_t dce_refs; + d_rank_list_t *dce_ranks; + uint8_t *dce_hints; + uint8_t *dce_bitmap; + uint32_t dce_hint_sz; + uint32_t dce_bitmap_sz; +}; + struct dtx_leader_handle; typedef int (*dtx_agg_cb_t)(struct dtx_leader_handle *dlh, int allow_failure); @@ -153,7 +165,10 @@ struct dtx_leader_handle { struct dtx_handle dlh_handle; /* result for the distribute transaction */ int dlh_result; - + /* The known latest pool map version from remote targets. */ + uint32_t dlh_rmt_ver; + /* For 64-bits alignment. */ + uint32_t dlh_padding; /* The array of the DTX COS entries */ uint32_t dlh_dti_cos_count; struct dtx_id *dlh_dti_cos; @@ -165,8 +180,14 @@ struct dtx_leader_handle { int32_t dlh_allow_failure; /* Normal sub requests have been processed. */ uint32_t dlh_normal_sub_done:1, + /* For collective DTX. */ + dlh_coll:1, + /* Only forward RPC, but neither commit nor abort DTX. */ + dlh_relay:1, /* Drop conditional flags when forward RPC. */ dlh_drop_cond:1; + /* Elements for collective DTX. */ + struct dtx_coll_entry *dlh_coll_entry; /* How many normal sub request. */ uint32_t dlh_normal_sub_cnt; /* How many delay forward sub request. */ @@ -180,7 +201,8 @@ struct dtx_leader_handle { }; struct dtx_stat { - uint64_t dtx_committable_count; + uint32_t dtx_committable_count; + uint32_t dtx_committable_coll_count; uint64_t dtx_oldest_committable_time; uint64_t dtx_oldest_active_time; /* The epoch for the oldest entry in the 1st committed blob. */ @@ -206,7 +228,7 @@ enum dtx_flags { DTX_FOR_MIGRATION = (1 << 3), /** Ignore other uncommitted DTXs. */ DTX_IGNORE_UNCOMMITTED = (1 << 4), - /** Resent request. */ + /** Resent request. Out-of-date. */ DTX_RESEND = (1 << 5), /** Force DTX refresh if hit non-committed DTX on non-leader. Out-of-date DAOS-7878. */ DTX_FORCE_REFRESH = (1 << 6), @@ -214,6 +236,10 @@ enum dtx_flags { DTX_PREPARED = (1 << 7), /** Do not keep committed entry. */ DTX_DROP_CMT = (1 << 8), + /* The non-leader targets are collective. */ + DTX_TGT_COLL = (1 << 9), + /* Not real DTX leader, Only forward IO to others, but neither commit nor abort DTX. 
*/ + DTX_RELAY = (1 << 10), }; void @@ -221,12 +247,11 @@ dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth); int dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash); int -dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, - struct dtx_epoch *epoch, uint16_t sub_modification_cnt, - uint32_t pm_ver, daos_unit_oid_t *leader_oid, - struct dtx_id *dti_cos, int dti_cos_cnt, - struct daos_shard_tgt *tgts, int tgt_cnt, uint32_t flags, - struct dtx_memberships *mbs, struct dtx_leader_handle **p_dlh); +dtx_leader_begin(daos_handle_t coh, struct dtx_id *dti, struct dtx_epoch *epoch, + uint16_t sub_modification_cnt, uint32_t pm_ver, daos_unit_oid_t *leader_oid, + struct dtx_id *dti_cos, int dti_cos_cnt, struct daos_shard_tgt *tgts, int tgt_cnt, + uint32_t flags, struct dtx_memberships *mbs, struct dtx_coll_entry *dce, + struct dtx_leader_handle **p_dlh); int dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int result); @@ -261,10 +286,19 @@ void dtx_cont_deregister(struct ds_cont_child *cont); int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); +int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, + struct dtx_cos_key *dcks, int count); + int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); +int +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); + +int +dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); + /** * Check whether the given DTX is resent one or not. * @@ -290,6 +324,24 @@ int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int dtx_handle_resend(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, uint32_t *pm_ver); +static inline struct dtx_coll_entry * +dtx_coll_entry_get(struct dtx_coll_entry *dce) +{ + dce->dce_refs++; + return dce; +} + +static inline void +dtx_coll_entry_put(struct dtx_coll_entry *dce) +{ + if (dce != NULL && --(dce->dce_refs) == 0) { + d_rank_list_free(dce->dce_ranks); + D_FREE(dce->dce_bitmap); + D_FREE(dce->dce_hints); + D_FREE(dce); + } +} + static inline void dtx_dsp_free(struct dtx_share_peer *dsp) { @@ -306,7 +358,12 @@ dtx_entry_get(struct dtx_entry *dte) return dte; } -void dtx_entry_put(struct dtx_entry *dte); +static inline void +dtx_entry_put(struct dtx_entry *dte) +{ + if (--(dte->dte_refs) == 0) + D_FREE(dte); +} static inline bool dtx_is_valid_handle(const struct dtx_handle *dth) diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 9d6035f66aa..390f4f4e5ec 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -103,12 +103,16 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, * * \param coh [IN] Container open handle. * \param dti [IN] Pointer to the DTX identifier. + * \param oid [OUT] Pointer to the ID for the DTX leader object shard. * \param mbs [OUT] Pointer to the DTX participants information. * - * \return Zero on success, negative value if error. + * \return Zero on success. + * Positive if DTX has been committed. + * Negative value if error. */ int -vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs); +vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, + struct dtx_memberships **mbs); /** * Commit the specified DTXs. 
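The new struct dtx_coll_entry is shared by reference: dce_refs is bumped by dtx_coll_entry_get() and the final dtx_coll_entry_put() frees the rank list, bitmap, hints and the entry itself. A minimal sketch of that contract (illustrative only; alloc_dce() is a hypothetical helper rather than part of this patch, and the plain non-atomic counter assumes the entry is only shared within a single xstream):

#include <gurt/common.h>
#include <daos_srv/dtx_srv.h>

/* Creator owns the initial reference (hypothetical helper, not in the patch). */
static struct dtx_coll_entry *
alloc_dce(void)
{
	struct dtx_coll_entry	*dce;

	D_ALLOC_PTR(dce);
	if (dce != NULL)
		dce->dce_refs = 1;
	return dce;
}

/* Each consumer pins the entry with get() and releases it with put(). */
static int
coll_commit_example(struct ds_cont_child *cont, struct dtx_coll_entry *dce,
		    struct dtx_cos_key *dck)
{
	struct dtx_coll_entry	*ref = dtx_coll_entry_get(dce);
	int			 rc;

	rc = dtx_coll_commit(cont, ref, dck);
	/* Drop the pin; the last put frees dce_ranks, dce_bitmap and dce_hints. */
	dtx_coll_entry_put(ref);
	return rc;
}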
diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index 4e80bacae4e..9c9999ba97c 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -26,15 +26,6 @@ struct dtx_rsrvd_uint { d_list_t dru_nvme; }; -enum dtx_cos_flags { - DCF_SHARED = (1 << 0), - /* Some DTX (such as for the distributed transaction across multiple - * RDGs, or for EC object modification) need to be committed via DTX - * RPC instead of piggyback via other dispatched update/punch RPC. - */ - DCF_EXP_CMT = (1 << 1), -}; - enum dtx_stat_flags { /* Skip bad DTX entries (such as corruptted ones) when stat. */ DSF_SKIP_BAD = (1 << 1), diff --git a/src/include/dfuse_ioctl.h b/src/include/dfuse_ioctl.h index ac23fbbcff2..18b9b1fdb95 100644 --- a/src/include/dfuse_ioctl.h +++ b/src/include/dfuse_ioctl.h @@ -7,7 +7,7 @@ #define __DFUSE_IOCTL_H__ #include -#include "daos.h" +#include #define DFUSE_IOCTL_TYPE 0xA3 /* Arbitrary "unique" type of the IOCTL */ #define DFUSE_IOCTL_REPLY_BASE 0xC1 /* Number of the IOCTL. Also arbitrary */ diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index d05cbedd2c5..779a547768b 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -574,12 +574,44 @@ d_sgl_buf_copy(d_sg_list_t *dst_sgl, d_sg_list_t *src_sgl) } } -void d_getenv_bool(const char *env, bool *bool_val); -void d_getenv_char(const char *env, char *char_val); -void d_getenv_int(const char *env, unsigned int *int_val); -int d_getenv_uint64_t(const char *env, uint64_t *val); -int d_write_string_buffer(struct d_string_buffer_t *buf, const char *fmt, ...); -void d_free_string(struct d_string_buffer_t *buf); +bool +d_isenv_def(char *name); +int +d_getenv_str(char *str_val, size_t str_size, const char *name); +int +d_agetenv_str(char **str_val, const char *name); +void +d_free_env_str(char **str_val); +int +d_getenv_bool(const char *name, bool *bool_val); +int +d_getenv_char(const char *name, char *char_val); +int +d_getenv_uint(const char *name, unsigned int *uint_val); +int +d_getenv_uint32_t(const char *name, uint32_t *uint32_val); +int +d_getenv_uint64_t(const char *name, uint64_t *uint64_val); +int +d_putenv(char *name); +int +d_setenv(const char *name, const char *value, int overwrite); +int +d_unsetenv(const char *name); +int +d_clearenv(void); + +static inline int +d_getenv_int(const char *name, unsigned int *uint_val) +{ + D_WARN("d_getenv_int() is deprecated, please use d_getenv_uint()"); + return d_getenv_uint(name, uint_val); +} + +int +d_write_string_buffer(struct d_string_buffer_t *buf, const char *fmt, ...); +void +d_free_string(struct d_string_buffer_t *buf); typedef void (*d_alloc_track_cb_t)(void *arg, size_t size); diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index fc8eeb45493..57cf0faa723 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -480,7 +480,7 @@ int dc_mgmt_net_cfg(const char *name) continue; } - rc = setenv(v_name, v_value, 0); + rc = d_setenv(v_name, v_value, 0); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); D_DEBUG(DB_MGMT, "set server-supplied client env: %s", env); @@ -491,19 +491,19 @@ int dc_mgmt_net_cfg(const char *name) g_num_serv_ranks = resp->n_rank_uris; D_INFO("Setting number of server ranks to %d\n", g_num_serv_ranks); /* These two are always set */ - rc = setenv("CRT_PHY_ADDR_STR", info.provider, 1); + rc = 
d_setenv("CRT_PHY_ADDR_STR", info.provider, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); sprintf(buf, "%d", info.crt_ctx_share_addr); - rc = setenv("CRT_CTX_SHARE_ADDR", buf, 1); + rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); /* If the server has set this, the client must use the same value. */ if (info.srv_srx_set != -1) { sprintf(buf, "%d", info.srv_srx_set); - rc = setenv("FI_OFI_RXM_USE_SRX", buf, 1); + rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); D_INFO("Using server's value for FI_OFI_RXM_USE_SRX: %s\n", @@ -522,7 +522,7 @@ int dc_mgmt_net_cfg(const char *name) crt_timeout = getenv("CRT_TIMEOUT"); if (!crt_timeout) { sprintf(buf, "%d", info.crt_timeout); - rc = setenv("CRT_TIMEOUT", buf, 1); + rc = d_setenv("CRT_TIMEOUT", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { @@ -533,7 +533,7 @@ int dc_mgmt_net_cfg(const char *name) ofi_interface = getenv("OFI_INTERFACE"); ofi_domain = getenv("OFI_DOMAIN"); if (!ofi_interface) { - rc = setenv("OFI_INTERFACE", info.interface, 1); + rc = d_setenv("OFI_INTERFACE", info.interface, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -545,7 +545,7 @@ int dc_mgmt_net_cfg(const char *name) D_WARN("Ignoring OFI_DOMAIN '%s' because OFI_INTERFACE is not set; using " "automatic configuration instead\n", ofi_domain); - rc = setenv("OFI_DOMAIN", info.domain, 1); + rc = d_setenv("OFI_DOMAIN", info.domain, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { diff --git a/src/mgmt/srv_query.c b/src/mgmt/srv_query.c index e33d3b71ca2..7bfd31f85d8 100644 --- a/src/mgmt/srv_query.c +++ b/src/mgmt/srv_query.c @@ -324,6 +324,17 @@ copy_str2ctrlr(char **dst, const char *src) return 0; } +static void +ctrlr_reset_str_fields(Ctl__NvmeController *ctrlr) +{ + ctrlr->pci_addr = NULL; + ctrlr->model = NULL; + ctrlr->serial = NULL; + ctrlr->fw_rev = NULL; + ctrlr->vendor_id = NULL; + ctrlr->pci_dev_type = NULL; +} + static int add_ctrlr_details(Ctl__NvmeController *ctrlr, struct bio_dev_info *dev_info) { @@ -429,6 +440,8 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) break; } ctl__nvme_controller__init(resp->devices[i]->ctrlr); + /* Set string fields to NULL to allow D_FREE to work as expected on cleanup */ + ctrlr_reset_str_fields(resp->devices[i]->ctrlr); rc = add_ctrlr_details(resp->devices[i]->ctrlr, dev_info); if (rc != 0) @@ -459,6 +472,7 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) resp->devices[i]->ctrlr->namespaces[0]->id = dev_info->bdi_ctrlr->nss->id; resp->devices[i]->ctrlr->namespaces[0]->size = dev_info->bdi_ctrlr->nss->size; + resp->devices[i]->ctrlr_namespace_id = dev_info->bdi_ctrlr->nss->id; D_DEBUG(DB_MGMT, "ns id/size: '%d' '%ld'\n", resp->devices[i]->ctrlr->namespaces[0]->id, @@ -712,7 +726,6 @@ ds_mgmt_dev_set_faulty(uuid_t dev_uuid, Ctl__DevManageResp *resp) } ctl__smd_device__init(resp->device); resp->device->uuid = NULL; - // resp->device->dev_state = CTL__NVME_DEV_STATE__EVICTED; D_ALLOC(resp->device->uuid, DAOS_UUID_STR_SIZE); if (resp->device->uuid == NULL) { @@ -738,7 +751,6 @@ ds_mgmt_dev_set_faulty(uuid_t dev_uuid, Ctl__DevManageResp *resp) DL_ERROR(rc, "FAULT LED state not set on device:" DF_UUID, DP_UUID(dev_uuid)); } - // resp->device->led_state = led_state; out: smd_dev_free_info(dev_info); @@ -767,11 +779,13 @@ ds_mgmt_dev_manage_led(Ctl__LedManageReq *req, Ctl__DevManageResp *resp) return -DER_NOMEM; } ctl__nvme_controller__init(resp->device->ctrlr); + /* Set 
string fields to NULL to allow D_FREE to work as expected on cleanup */ + ctrlr_reset_str_fields(resp->device->ctrlr); D_ALLOC(resp->device->ctrlr->pci_addr, ADDR_STR_MAX_LEN + 1); if (resp->device->ctrlr->pci_addr == NULL) return -DER_NOMEM; - if ((req->ids == NULL) || (strlen(req->ids) == 0)) { + if ((req->ids == NULL) || (strnlen(req->ids, ADDR_STR_MAX_LEN) == 0)) { D_ERROR("PCI address not provided in request\n"); return -DER_INVAL; } diff --git a/src/object/cli_mod.c b/src/object/cli_mod.c index 79c13fee948..336c43253c8 100644 --- a/src/object/cli_mod.c +++ b/src/object/cli_mod.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -16,6 +16,9 @@ #include "obj_rpc.h" #include "obj_internal.h" +#define OBJ_COLL_PUNCH_THD_MIN 31 + +unsigned int obj_coll_punch_thd; unsigned int srv_io_mode = DIM_DTX_FULL_ENABLED; int dc_obj_proto_version; @@ -68,6 +71,16 @@ dc_obj_init(void) D_GOTO(out_class, rc); } + obj_coll_punch_thd = OBJ_COLL_PUNCH_THD_MIN; + d_getenv_uint("DAOS_OBJ_COLL_PUNCH_THD", &obj_coll_punch_thd); + if (obj_coll_punch_thd < OBJ_COLL_PUNCH_THD_MIN) { + D_WARN("Invalid collective punch threshold %u, it cannot be smaller than %u, " + "use the default value %u\n", obj_coll_punch_thd, + OBJ_COLL_PUNCH_THD_MIN, OBJ_COLL_PUNCH_THD_MIN); + obj_coll_punch_thd = OBJ_COLL_PUNCH_THD_MIN; + } + D_INFO("Set object collective punch threshold as %u\n", obj_coll_punch_thd); + tx_verify_rdg = false; d_getenv_bool("DAOS_TX_VERIFY_RDG", &tx_verify_rdg); D_INFO("%s TX redundancy group verification\n", tx_verify_rdg ? "Enable" : "Disable"); diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 710b94d18a7..088e87067c4 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -356,6 +356,8 @@ obj_layout_create(struct dc_object *obj, unsigned int mode, bool refresh) obj_shard->do_fseq = layout->ol_shards[i].po_fseq; obj_shard->do_rebuilding = layout->ol_shards[i].po_rebuilding; obj_shard->do_reintegrating = layout->ol_shards[i].po_reintegrating; + obj_shard->do_target_rank = layout->ol_shards[i].po_rank; + obj_shard->do_target_idx = layout->ol_shards[i].po_index; } out: if (layout) @@ -2344,6 +2346,98 @@ check_query_flags(daos_obj_id_t oid, uint32_t flags, daos_key_t *dkey, return 0; } +static int +obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool for_modify) +{ + struct dc_pool *pool = obj->cob_pool; + uint32_t node_nr; + int rc = 0; + + D_ASSERT(pool != NULL); + D_ASSERT(coa->coa_dcts == NULL); + + D_RWLOCK_RDLOCK(&pool->dp_map_lock); + node_nr = pool_map_node_nr(pool->dp_map); + D_RWLOCK_UNLOCK(&pool->dp_map_lock); + + D_ALLOC_ARRAY(coa->coa_dcts, node_nr); + if (coa->coa_dcts == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* + * Set coa_dct_nr as -1 to indicate that the coa_dcts array may be sparse until + * obj_coll_oper_args_collapse(). That is useful for obj_coll_oper_args_fini(). + */ + coa->coa_dct_nr = -1; + coa->coa_dct_cap = node_nr; + coa->coa_max_dct_sz = 0; + coa->coa_max_shard_nr = 0; + coa->coa_max_bitmap_sz = 0; + coa->coa_target_nr = 0; + coa->coa_for_modify = for_modify ? 1 : 0; + +out: + return rc; +} + +static void +obj_coll_oper_args_fini(struct coll_oper_args *coa) +{ + daos_coll_target_cleanup(coa->coa_dcts, + coa->coa_dct_nr < 0 ? 
coa->coa_dct_cap : coa->coa_dct_nr); + coa->coa_dcts = NULL; + coa->coa_dct_cap = 0; + coa->coa_dct_nr = 0; +} + +static int +obj_coll_oper_args_collapse(struct coll_oper_args *coa, uint32_t *size) +{ + struct daos_coll_target *dct; + struct daos_coll_shard *dcs; + uint32_t dct_size; + int rc = 0; + int i; + int j; + + for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + dct = &coa->coa_dcts[i]; + if (dct->dct_bitmap != NULL) { + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (j = 0; j <= dct->dct_max_shard; j++) { + dcs = &dct->dct_shards[j]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (coa->coa_dct_nr < i) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; + } + } + + if (unlikely(coa->coa_dct_nr == 0)) + /* If all shards are NONEXIST, then need not to send RPC(s). */ + rc = 1; + else if (coa->coa_dct_cap > coa->coa_dct_nr) + /* Reset the other dct slots to avoid double free during cleanup. */ + memset(&coa->coa_dcts[coa->coa_dct_nr], 0, + sizeof(*dct) * (coa->coa_dct_cap - coa->coa_dct_nr)); + + return rc; +} + static inline bool obj_key_valid(daos_obj_id_t oid, daos_key_t *key, bool check_dkey) { @@ -2843,6 +2937,7 @@ obj_embedded_shard_arg(struct obj_auxi_args *obj_auxi) case DAOS_OBJ_RPC_SYNC: return &obj_auxi->s_args.sa_auxi; case DAOS_OBJ_RPC_QUERY_KEY: + case DAOS_OBJ_RPC_COLL_PUNCH: /* * called from obj_comp_cb_internal() and * checked in obj_shard_comp_cb() correctly @@ -4823,6 +4918,9 @@ obj_comp_cb(tse_task_t *task, void *data) } } + if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) + obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); + if ((!obj_auxi->no_retry || task->dt_result == -DER_FETCH_AGAIN) && (pm_stale || obj_auxi->io_retry)) { rc = obj_retry_cb(task, obj, obj_auxi, pm_stale, &io_task_reinited); @@ -4868,6 +4966,7 @@ obj_comp_cb(tse_task_t *task, void *data) dc_tx_attach(obj_auxi->th, obj, DAOS_OBJ_RPC_FETCH, task, 0, false); break; } + case DAOS_OBJ_RPC_COLL_PUNCH: case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_PUNCH_DKEYS: case DAOS_OBJ_RPC_PUNCH_AKEYS: @@ -6641,18 +6740,9 @@ shard_punch_prep(struct shard_auxi_args *shard_auxi, struct dc_object *obj, struct obj_auxi_args *obj_auxi, uint32_t grp_idx) { struct shard_punch_args *shard_arg; - uuid_t coh_uuid; - uuid_t cont_uuid; - int rc; - - rc = dc_cont2uuid(obj->cob_co, &coh_uuid, &cont_uuid); - if (rc != 0) - return rc; shard_arg = container_of(shard_auxi, struct shard_punch_args, pa_auxi); - shard_arg->pa_opc = obj_auxi->opc; - uuid_copy(shard_arg->pa_coh_uuid, coh_uuid); - uuid_copy(shard_arg->pa_cont_uuid, cont_uuid); + shard_arg->pa_opc = obj_auxi->opc; if (daos_handle_is_inval(obj_auxi->th)) daos_dti_gen(&shard_arg->pa_dti, @@ -6663,6 +6753,434 @@ shard_punch_prep(struct shard_auxi_args *shard_auxi, struct dc_object *obj, return 0; } +static int +obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, + uint32_t map_ver, uint32_t idx) +{ + struct dc_obj_shard *shard = NULL; + struct daos_coll_target *dct; + struct daos_coll_shard *dcs; + uint32_t *tmp; + uint8_t *new_bm; + int size; + int rc = 0; + int i; + + rc = obj_shard_open(obj, idx, map_ver, &shard); + if (rc == -DER_NONEXIST) + D_GOTO(out, rc = 0); + + if 
(rc != 0 || (shard->do_rebuilding && !coa->coa_for_modify))
+		goto out;
+
+	/* More ranks joined after obj_coll_oper_args_init(). */
+	if (unlikely(shard->do_target_rank >= coa->coa_dct_cap)) {
+		D_REALLOC_ARRAY(dct, coa->coa_dcts, coa->coa_dct_cap, shard->do_target_rank + 2);
+		if (dct == NULL)
+			D_GOTO(out, rc = -DER_NOMEM);
+
+		coa->coa_dcts = dct;
+		coa->coa_dct_cap = shard->do_target_rank + 2;
+	}
+
+	dct = &coa->coa_dcts[shard->do_target_rank];
+	dct->dct_rank = shard->do_target_rank;
+
+	if (shard->do_target_idx >= dct->dct_bitmap_sz << 3) {
+		size = (shard->do_target_idx >> 3) + 1;
+
+		D_ALLOC_ARRAY(dcs, size << 3);
+		if (dcs == NULL)
+			D_GOTO(out, rc = -DER_NOMEM);
+
+		if (dct->dct_shards != NULL) {
+			memcpy(dcs, dct->dct_shards, sizeof(*dcs) * (dct->dct_max_shard + 1));
+			for (i = 0; i <= dct->dct_max_shard; i++) {
+				if (dcs[i].dcs_nr == 1)
+					dcs[i].dcs_buf = &dcs[i].dcs_inline;
+			}
+			D_FREE(dct->dct_shards);
+		}
+		dct->dct_shards = dcs;
+
+		D_REALLOC(new_bm, dct->dct_bitmap, dct->dct_bitmap_sz, size);
+		if (new_bm == NULL)
+			D_GOTO(out, rc = -DER_NOMEM);
+
+		dct->dct_bitmap = new_bm;
+		dct->dct_bitmap_sz = size;
+	}
+
+	dcs = &dct->dct_shards[shard->do_target_idx];
+
+	if (unlikely(isset(dct->dct_bitmap, shard->do_target_idx))) {
+		/* More than one shard resides on the same VOS target. */
+		D_ASSERT(dcs->dcs_nr >= 1);
+
+		if (dcs->dcs_nr >= dcs->dcs_cap) {
+			D_ALLOC_ARRAY(tmp, dcs->dcs_nr << 1);
+			if (tmp == NULL)
+				D_GOTO(out, rc = -DER_NOMEM);
+
+			memcpy(tmp, dcs->dcs_buf, sizeof(*tmp) * dcs->dcs_nr);
+			if (dcs->dcs_buf != &dcs->dcs_inline)
+				D_FREE(dcs->dcs_buf);
+			dcs->dcs_buf = tmp;
+			dcs->dcs_cap = dcs->dcs_nr << 1;
+		}
+	} else {
+		D_ASSERT(dcs->dcs_nr == 0);
+
+		dcs->dcs_idx = idx;
+		dcs->dcs_buf = &dcs->dcs_inline;
+		setbit(dct->dct_bitmap, shard->do_target_idx);
+		if (dct->dct_max_shard < shard->do_target_idx)
+			dct->dct_max_shard = shard->do_target_idx;
+	}
+
+	dcs->dcs_buf[dcs->dcs_nr++] = shard->do_id.id_shard;
+
+	if (unlikely(dct->dct_tgt_nr == (uint8_t)(-1)))
+		goto out;
+
+	if (coa->coa_for_modify) {
+		if (dct->dct_tgt_nr >= dct->dct_tgt_cap) {
+			if (dct->dct_tgt_cap == 0)
+				size = 4;
+			else if (dct->dct_tgt_cap <= 8)
+				size = dct->dct_tgt_cap << 1;
+			else
+				size = dct->dct_tgt_cap + 8;
+
+			D_REALLOC_ARRAY(tmp, dct->dct_tgt_ids, dct->dct_tgt_cap, size);
+			if (tmp == NULL)
+				D_GOTO(out, rc = -DER_NOMEM);
+
+			dct->dct_tgt_ids = tmp;
+			dct->dct_tgt_cap = size;
+		}
+
+		/*
+		 * The dct->dct_tgt_ids array may contain repeated elements because multiple
+		 * object shards can reside on the same VOS target. Storing them in the DTX
+		 * MBS is harmless; the related DTX check logic will handle that.
+		 */
+		dct->dct_tgt_ids[dct->dct_tgt_nr++] = shard->do_target_id;
+		if (coa->coa_max_shard_nr < dct->dct_tgt_nr)
+			coa->coa_max_shard_nr = dct->dct_tgt_nr;
+
+		if (coa->coa_target_nr < DTX_COLL_INLINE_TARGETS &&
+		    !shard->do_rebuilding && !shard->do_reintegrating)
+			coa->coa_targets[coa->coa_target_nr++] = shard->do_target_id;
+
+		if (coa->coa_max_bitmap_sz < dct->dct_bitmap_sz)
+			coa->coa_max_bitmap_sz = dct->dct_bitmap_sz;
+	} else {
+		/* "dct_tgt_cap" is zero, so dct_tgt_ids will not be sent to the server.
*/ + dct->dct_tgt_nr++; + } + +out: + if (shard != NULL) + obj_shard_close(shard); + + return rc; +} + +struct obj_coll_punch_cb_args { + unsigned char *cpca_buf; + struct dtx_memberships *cpca_mbs; + struct dc_obj_shard *cpca_shard; + crt_bulk_t *cpca_bulks; + crt_proc_t cpca_proc; + d_sg_list_t cpca_sgl; + d_iov_t cpca_iov; +}; + +static int +dc_obj_coll_punch_cb(tse_task_t *task, void *data) +{ + struct obj_coll_punch_cb_args *cpca = data; + + if (cpca->cpca_bulks != NULL) { + if (cpca->cpca_bulks[0] != CRT_BULK_NULL) + crt_bulk_free(cpca->cpca_bulks[0]); + D_FREE(cpca->cpca_bulks); + } + + if (cpca->cpca_proc != NULL) + crt_proc_destroy(cpca->cpca_proc); + + D_FREE(cpca->cpca_mbs); + D_FREE(cpca->cpca_buf); + obj_shard_close(cpca->cpca_shard); + + return 0; +} + +static int +dc_obj_coll_punch_mbs(struct coll_oper_args *coa, struct dc_object *obj, uint32_t leader_id, + struct dtx_memberships **p_mbs) +{ + struct dtx_memberships *mbs; + struct dtx_daos_target *ddt; + struct dtx_coll_target *dct; + int rc = 0; + int i; + int j; + + D_ALLOC(mbs, sizeof(*mbs) + sizeof(*ddt) * coa->coa_target_nr + sizeof(*dct)); + if (mbs == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* + * For object collective punch, even if we lost some redundancy groups when DTX resync, + * we still continue to punch remaining shards. So let's set dm_grp_cnt as 1 to bypass + * redundancy group check. + */ + mbs->dm_grp_cnt = 1; + mbs->dm_tgt_cnt = coa->coa_target_nr; + mbs->dm_data_size = sizeof(*ddt) * coa->coa_target_nr + sizeof(*dct); + mbs->dm_flags = DMF_CONTAIN_LEADER | DMF_COLL_TARGET; + + /* ddt[0] will be the lead target. */ + ddt = &mbs->dm_tgts[0]; + ddt[0].ddt_id = leader_id; + + for (i = 0, j = 1; i < coa->coa_target_nr && j < coa->coa_target_nr; i++) { + if (coa->coa_targets[i] != ddt[0].ddt_id) + ddt[j++].ddt_id = coa->coa_targets[i]; + } + + dct = (struct dtx_coll_target *)(ddt + coa->coa_target_nr); + dct->dct_fdom_lvl = obj->cob_md.omd_fdom_lvl; + dct->dct_pda = obj->cob_md.omd_pda; + dct->dct_pdom_lvl = obj->cob_md.omd_pdom_lvl; + dct->dct_layout_ver = obj->cob_layout_version; + + /* The other fields will not be packed on-wire. Related engine will fill them in future. */ + + *p_mbs = mbs; + +out: + return rc; +} + +static int +dc_obj_coll_punch_bulk(tse_task_t *task, struct coll_oper_args *coa, + struct obj_coll_punch_cb_args *cpca, uint32_t *p_size) +{ + /* The proc function may pack more information inside the buffer, enlarge the size a bit. 
*/ + uint32_t size = (*p_size * 9) >> 3; + uint32_t used = 0; + int rc = 0; + int i; + +again: + D_ALLOC(cpca->cpca_buf, size); + if (cpca->cpca_buf == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + rc = crt_proc_create(daos_task2ctx(task), cpca->cpca_buf, size, CRT_PROC_ENCODE, + &cpca->cpca_proc); + if (rc != 0) + goto out; + + for (i = 0; i < coa->coa_dct_nr; i++) { + rc = crt_proc_struct_daos_coll_target(cpca->cpca_proc, CRT_PROC_ENCODE, + &coa->coa_dcts[i]); + if (rc != 0) + goto out; + } + + used = crp_proc_get_size_used(cpca->cpca_proc); + if (unlikely(used > size)) { + crt_proc_destroy(cpca->cpca_proc); + cpca->cpca_proc = NULL; + D_FREE(cpca->cpca_buf); + size = used; + goto again; + } + + cpca->cpca_iov.iov_buf = cpca->cpca_buf; + cpca->cpca_iov.iov_buf_len = used; + cpca->cpca_iov.iov_len = used; + + cpca->cpca_sgl.sg_nr = 1; + cpca->cpca_sgl.sg_nr_out = 1; + cpca->cpca_sgl.sg_iovs = &cpca->cpca_iov; + + rc = obj_bulk_prep(&cpca->cpca_sgl, 1, false, CRT_BULK_RO, task, &cpca->cpca_bulks); + +out: + if (rc != 0) { + if (cpca->cpca_proc != NULL) { + crt_proc_destroy(cpca->cpca_proc); + cpca->cpca_proc = NULL; + } + D_FREE(cpca->cpca_buf); + } else { + *p_size = used; + } + + return rc; +} + +static int +dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch, + uint32_t map_ver, daos_obj_punch_t *args, struct obj_auxi_args *auxi) +{ + struct shard_punch_args *spa = &auxi->p_args; + struct coll_oper_args *coa = &spa->pa_coa; + struct dc_obj_shard *shard = NULL; + struct dtx_memberships *mbs = NULL; + struct daos_coll_target *dct; + struct daos_coll_target tmp_tgt; + struct obj_coll_punch_cb_args cpca = { 0 }; + uint32_t tgt_size = 0; + uint32_t mbs_max_size; + uint32_t inline_size; + uint32_t flags = ORF_LEADER; + uint32_t leader; + uint32_t len; + int rc; + int i; + + rc = obj_coll_oper_args_init(coa, obj, true); + if (rc != 0) + goto out; + + for (i = 0; i < obj->cob_shards_nr; i++) { + rc = obj_coll_prep_one(coa, obj, map_ver, i); + if (rc != 0) + goto out; + } + + rc = obj_coll_oper_args_collapse(coa, &tgt_size); + if (rc != 0) + goto out; + + if (auxi->io_retry) { + /* Try to reuse the same leader. */ + rc = obj_shard_open(obj, spa->pa_auxi.shard, map_ver, &shard); + if (rc == 0) { + if (!shard->do_rebuilding && !shard->do_reintegrating) { + leader = shard->do_target_rank; + goto gen_mbs; + } + + obj_shard_close(shard); + shard = NULL; + } else if (rc != -DER_NONEXIST) { + goto out; + } + + /* Then change to new leader for retry. */ + } + + /* Randomly select a rank as the leader. */ + leader = d_rand() % coa->coa_dct_nr; + +new_leader: + dct = &coa->coa_dcts[leader]; + len = dct->dct_bitmap_sz << 3; + + for (i = 0; i < len; i++) { + if (isset(dct->dct_bitmap, i)) { + rc = obj_shard_open(obj, dct->dct_shards[i].dcs_idx, map_ver, &shard); + D_ASSERT(rc == 0); + + if (!shard->do_rebuilding && !shard->do_reintegrating) + goto gen_mbs; + + obj_shard_close(shard); + shard = NULL; + } + } + + /* Try another for leader. 
*/
+	leader = (leader + 1) % coa->coa_dct_nr;
+	goto new_leader;
+
+gen_mbs:
+	if (leader != 0) {
+		memcpy(&tmp_tgt, &coa->coa_dcts[0], sizeof(tmp_tgt));
+		memcpy(&coa->coa_dcts[0], &coa->coa_dcts[leader], sizeof(tmp_tgt));
+		memcpy(&coa->coa_dcts[leader], &tmp_tgt, sizeof(tmp_tgt));
+	}
+
+	rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs);
+	if (rc < 0)
+		goto out;
+
+	inline_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(struct obj_coll_punch_in);
+	D_ASSERTF(inline_size < DAOS_BULK_LIMIT,
+		  "Too much data to be held inside coll punch RPC body: %u vs %u\n",
+		  inline_size, DAOS_BULK_LIMIT);
+
+	if (inline_size + tgt_size >= DAOS_BULK_LIMIT) {
+		rc = dc_obj_coll_punch_bulk(task, coa, &cpca, &tgt_size);
+		if (rc != 0)
+			goto out;
+	}
+
+	cpca.cpca_shard = shard;
+	cpca.cpca_mbs = mbs;
+	rc = tse_task_register_comp_cb(task, dc_obj_coll_punch_cb, &cpca, sizeof(cpca));
+	if (rc != 0)
+		goto out;
+
+	if (auxi->io_retry) {
+		flags |= ORF_RESEND;
+		/* Reset @enqueue_id if resending to a new leader. */
+		if (spa->pa_auxi.target != shard->do_target_id)
+			spa->pa_auxi.enqueue_id = 0;
+	} else {
+		spa->pa_auxi.obj_auxi = auxi;
+		daos_dti_gen(&spa->pa_dti, false);
+	}
+
+	spa->pa_auxi.target = shard->do_target_id;
+	spa->pa_auxi.shard = shard->do_shard_idx;
+
+	if (obj_is_ec(obj))
+		flags |= ORF_EC;
+
+	mbs_max_size = sizeof(*mbs) + mbs->dm_data_size +
+		       sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz;
+
+	return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size,
+				       coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch,
+				       args->flags, flags, map_ver, &auxi->map_ver_reply, task);
+
+out:
+	if (rc > 0)
+		rc = 0;
+
+	DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc,
+		  "DAOS_OBJ_RPC_COLL_PUNCH for "DF_OID" map_ver %u, task %p",
+		  DP_OID(obj->cob_md.omd_id), map_ver, task);
+
+	if (cpca.cpca_bulks != NULL) {
+		if (cpca.cpca_bulks[0] != CRT_BULK_NULL)
+			crt_bulk_free(cpca.cpca_bulks[0]);
+		D_FREE(cpca.cpca_bulks);
+	}
+
+	if (cpca.cpca_proc != NULL)
+		crt_proc_destroy(cpca.cpca_proc);
+	D_FREE(cpca.cpca_buf);
+
+	if (shard != NULL)
+		obj_shard_close(shard);
+	D_FREE(mbs);
+
+	/* obj_coll_oper_args_fini() will be triggered via the completion callback. */
+	obj_task_complete(task, rc);
+
+	return rc;
+}
+
 static int
 dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch,
 	     uint32_t map_ver, enum obj_rpc_opc opc, daos_obj_punch_t *api_args)
@@ -6673,13 +7191,6 @@ dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch,
 	uint32_t		 grp_cnt;
 	int			 rc;
 
-	if (opc == DAOS_OBJ_RPC_PUNCH && obj->cob_grp_nr > 1)
-		/* The object have multiple redundancy groups, use DAOS
-		 * internal transaction to handle that to guarantee the
-		 * atomicity of punch object.
-		 */
-		return dc_tx_convert(obj, opc, task);
-
 	rc = obj_task_init(task, opc, map_ver, api_args->th, &obj_auxi, obj);
 	if (rc != 0) {
 		obj_decref(obj);
@@ -6693,6 +7204,46 @@ dc_obj_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epoch,
 
 	if (opc == DAOS_OBJ_RPC_PUNCH) {
 		obj_ptr2shards(obj, &shard, &shard_cnt, &grp_cnt);
+
+		if (grp_cnt > 1) {
+			/*
+			 * We support object collective punch since release-2.6 (version 10).
+			 * The conditions that trigger object collective punch are:
+			 *
+			 * 1. The shards count reaches the threshold for collective punch (31
+			 *    by default). Collectively punching the object distributes the
+			 *    RPC load among more engines, even if the total RPC count may
+			 *    not decrease by much. Or
+			 *
+			 * 2. The shards count is twice (or more) the engines count, meaning
+			 *    that some shards reside on the same engine(s). Collectively
+			 *    punching the object will save some RPCs.
+			 *
+			 * If the object has multiple redundancy groups but matches neither
+			 * of the above conditions, then we will use an internal distributed
+			 * transaction instead.
+			 */
+			if (dc_obj_proto_version < 10)
+				D_GOTO(out_task, rc = -DER_NEED_TX);
+
+			if (shard_cnt < 4)
+				D_GOTO(out_task, rc = -DER_NEED_TX);
+
+			if (shard_cnt < obj_coll_punch_thd) {
+				struct dc_pool	*pool = obj->cob_pool;
+
+				D_RWLOCK_RDLOCK(&pool->dp_map_lock);
+				if (shard_cnt < pool_map_node_nr(pool->dp_map) << 1)
+					rc = -DER_NEED_TX;
+				D_RWLOCK_UNLOCK(&pool->dp_map_lock);
+
+				if (rc != 0)
+					goto out_task;
+			}
+
+			obj_auxi->opc = DAOS_OBJ_RPC_COLL_PUNCH;
+
+			return dc_obj_coll_punch(task, obj, epoch, map_ver, api_args, obj_auxi);
+		}
 	} else {
 		grp_cnt = 1;
 		obj_auxi->dkey_hash = obj_dkey2hash(obj->cob_md.omd_id, api_args->dkey);
diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c
index 2dd9ef9ac39..696aedb212d 100644
--- a/src/object/cli_shard.c
+++ b/src/object/cli_shard.c
@@ -61,29 +61,25 @@ obj_shard_addref(struct dc_obj_shard *shard)
 	D_SPIN_UNLOCK(&shard->do_obj->cob_spin);
 }
 
+static inline void
+obj_shard_addref_locked(struct dc_obj_shard *shard)
+{
+	shard->do_ref++;
+}
+
 int
 dc_obj_shard_open(struct dc_object *obj, daos_unit_oid_t oid, unsigned int mode,
 		  struct dc_obj_shard *shard)
 {
-	struct pool_target	*map_tgt;
-	int			 rc;
-
 	D_ASSERT(obj != NULL && shard != NULL);
 	D_ASSERT(shard->do_obj == NULL);
 
-	rc = dc_pool_tgt_idx2ptr(obj->cob_pool, shard->do_target_id,
-				 &map_tgt);
-	if (rc)
-		return rc;
-
 	shard->do_id = oid;
-	shard->do_target_rank = map_tgt->ta_comp.co_rank;
-	shard->do_target_idx = map_tgt->ta_comp.co_index;
 	shard->do_obj = obj;
 	shard->do_co = obj->cob_co;
-	obj_shard_addref(shard); /* release this until obj_layout_free */
 
 	D_SPIN_LOCK(&obj->cob_spin);
+	obj_shard_addref_locked(shard); /* release this until obj_layout_free */
 	obj->cob_shards->do_open_count++;
 	D_SPIN_UNLOCK(&obj->cob_spin);
 
@@ -1288,8 +1284,8 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc,
 		opi->opi_shard_tgts.ca_arrays = NULL;
 	}
 	uuid_copy(opi->opi_pool_uuid, pool->dp_pool);
-	uuid_copy(opi->opi_co_hdl, args->pa_coh_uuid);
-	uuid_copy(opi->opi_co_uuid, args->pa_cont_uuid);
+	uuid_copy(opi->opi_co_hdl, shard->do_co->dc_cont_hdl);
+	uuid_copy(opi->opi_co_uuid, shard->do_co->dc_uuid);
 	daos_dti_copy(&opi->opi_dti, &args->pa_dti);
 	opi->opi_flags = args->pa_auxi.flags;
 	opi->opi_dti_cos.ca_count = 0;
@@ -1307,6 +1303,142 @@ dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc,
 	return rc;
 }
 
+struct shard_coll_punch_cb_args {
+	crt_rpc_t			*cpca_rpc;
+	uint32_t			*cpca_ver;
+	struct shard_punch_args		*cpca_shard_args;
+};
+
+static int
+obj_shard_coll_punch_cb(tse_task_t *task, void *data)
+{
+	struct shard_coll_punch_cb_args	*cb_args = data;
+	crt_rpc_t			*rpc = cb_args->cpca_rpc;
+	struct obj_coll_punch_in	*ocpi = crt_req_get(rpc);
+
+	if (task->dt_result == 0) {
+		task->dt_result = obj_reply_get_status(rpc);
+		*cb_args->cpca_ver = obj_reply_map_version_get(rpc);
+	}
+
+	if (task->dt_result == -DER_OVERLOAD_RETRY) {
+		struct obj_coll_punch_out	*ocpo = crt_reply_get(rpc);
+		struct shard_punch_args		*shard_args = cb_args->cpca_shard_args;
+		uint32_t			 timeout = 0;
+
+		if (shard_args->pa_auxi.enqueue_id == 0)
+			shard_args->pa_auxi.enqueue_id = ocpo->ocpo_comm_out.req_out_enqueue_id;
+		crt_req_get_timeout(rpc, &timeout);
+		if (timeout >
shard_args->pa_auxi.obj_auxi->max_delay) + shard_args->pa_auxi.obj_auxi->max_delay = timeout; + } + + DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), + DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + + crt_req_decref(rpc); + + return task->dt_result; +} + +int +dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args, + struct dtx_memberships *mbs, uint32_t mbs_max_size, crt_bulk_t *bulks, + uint32_t bulk_sz, struct daos_coll_target *tgts, uint32_t tgt_nr, + uint32_t max_tgt_size, struct dtx_epoch *epoch, uint64_t api_flags, + uint32_t rpc_flags, uint32_t map_ver, uint32_t *rep_ver, tse_task_t *task) +{ + struct dc_pool *pool = obj_shard_ptr2pool(shard); + crt_rpc_t *req = NULL; + struct obj_coll_punch_in *ocpi = NULL; + struct shard_coll_punch_cb_args cb_args = { 0 }; + crt_endpoint_t tgt_ep = { 0 }; + int rc = 0; + + D_ASSERT(pool != NULL); + + tgt_ep.ep_grp = pool->dp_sys->sy_group; + tgt_ep.ep_rank = shard->do_target_rank; + tgt_ep.ep_tag = shard->do_target_idx; + + rc = obj_req_create(daos_task2ctx(task), &tgt_ep, DAOS_OBJ_RPC_COLL_PUNCH, &req); + if (rc != 0) + goto out; + + ocpi = crt_req_get(req); + D_ASSERT(ocpi != NULL); + + ocpi->ocpi_xid = args->pa_dti; + ocpi->ocpi_mbs = mbs; + ocpi->ocpi_odm.odm_mbs_max_sz = mbs_max_size; + uuid_copy(ocpi->ocpi_po_uuid, pool->dp_pool); + uuid_copy(ocpi->ocpi_co_hdl, shard->do_co->dc_cont_hdl); + uuid_copy(ocpi->ocpi_co_uuid, shard->do_co->dc_uuid); + ocpi->ocpi_oid = shard->do_id; + ocpi->ocpi_epoch = epoch->oe_value; + ocpi->ocpi_api_flags = api_flags; + ocpi->ocpi_map_ver = map_ver; + ocpi->ocpi_flags = rpc_flags; + + if (bulks != NULL) { + D_ASSERT(bulk_sz != 0); + + ocpi->ocpi_bulk_tgt_sz = bulk_sz; + ocpi->ocpi_bulk_tgt_nr = tgt_nr; + ocpi->ocpi_tgt_bulk = bulks[0]; + ocpi->ocpi_tgts.ca_count = 0; + ocpi->ocpi_tgts.ca_arrays = NULL; + } else { + D_ASSERT(tgts != NULL); + + ocpi->ocpi_bulk_tgt_sz = 0; + ocpi->ocpi_bulk_tgt_nr = 0; + ocpi->ocpi_tgt_bulk = NULL; + ocpi->ocpi_tgts.ca_count = tgt_nr; + ocpi->ocpi_tgts.ca_arrays = tgts; + } + + ocpi->ocpi_max_tgt_sz = max_tgt_size; + ocpi->ocpi_disp_width = 0; + ocpi->ocpi_disp_depth = 0; + + ocpi->ocpi_comm_in.req_in_enqueue_id = args->pa_auxi.enqueue_id; + + crt_req_addref(req); + cb_args.cpca_rpc = req; + cb_args.cpca_ver = rep_ver; + cb_args.cpca_shard_args = args; + + rc = tse_task_register_comp_cb(task, obj_shard_coll_punch_cb, &cb_args, sizeof(cb_args)); + if (rc != 0) + D_GOTO(out_req, rc); + + D_DEBUG(DB_IO, "Sending DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " + DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u, bulk_sz %u\n", + req, DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver, + (unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag, bulk_sz); + + return daos_rpc_send(req, task); + +out_req: + /* -1 for crt_req_addref(). */ + crt_req_decref(req); + /* -1 for obj_req_create(). 
*/
+	crt_req_decref(req);
+out:
+	D_ERROR("DAOS_OBJ_RPC_COLL_PUNCH RPC failed for "DF_UOID" with DTX "
+		DF_DTI" for task %p, map_ver %u, flags %lx/%x, leader %u/%u: "DF_RC"\n",
+		DP_UOID(shard->do_id), DP_DTI(&args->pa_dti), task, map_ver,
+		(unsigned long)api_flags, rpc_flags, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc));
+
+	obj_shard_decref(shard);
+	tse_task_complete(task, rc);
+	return rc;
+}
+
 struct obj_enum_args {
 	crt_rpc_t		*rpc;
 	daos_handle_t		*hdlp;
diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h
index 8a2b12fff55..4950dfd84d2 100644
--- a/src/object/obj_internal.h
+++ b/src/object/obj_internal.h
@@ -41,6 +41,7 @@ struct obj_io_context;
 extern bool		cli_bypass_rpc;
 /** Switch of server-side IO dispatch */
 extern unsigned int	srv_io_mode;
+extern unsigned int	obj_coll_punch_thd;
 
 /* Whether check redundancy group validation when DTX resync. */
 extern bool		tx_verify_rdg;
@@ -215,6 +216,32 @@ typedef int (*shard_io_cb_t)(struct dc_obj_shard *shard, enum obj_rpc_opc opc,
 			     void *shard_args, struct daos_shard_tgt *fw_shard_tgts,
 			     uint32_t fw_cnt, tse_task_t *task);
 
+struct obj_coll_disp_cursor {
+	/*
+	 * The length of the daos_coll_target array. The obj_coll_disp_cursor may be embedded
+	 * in some {obj,shard}_auxi_xxx structure that has size limitations, so the
+	 * daos_coll_target array is not contained inside the obj_coll_disp_cursor itself.
+	 */
+	uint32_t	tgt_nr;
+	/*
+	 * A "grp" here is not an object redundancy group; instead, it is a set of engine(s).
+	 * If there is only one engine in the group, then send the RPC to that engine directly.
+	 * Otherwise, choose a relay engine from the group and send the RPC to it, and the
+	 * relay engine will help to forward the RPC to the other engines in the group.
+	 */
+	uint16_t	grp_nr;
+	/* The count of engine groups that the RPC still needs to be dispatched to. */
+	uint16_t	pending_grps;
+	/* Current position in the daos_coll_target array. */
+	uint32_t	cur_pos;
+	/*
+	 * How many engines are in the group corresponding to cur_pos. As dispatch proceeds,
+	 * the count of engines in the current group may become smaller than in the former
+	 * group unless fixed_step is set.
+	 */
+	uint16_t	cur_step;
+	uint16_t	fixed_step:1;
+};
+
 /* shard update/punch auxiliary args, must be the first field of
  * shard_rw_args and shard_punch_args.
  */
@@ -248,12 +275,30 @@ struct shard_rw_args {
 	struct obj_reasb_req	*reasb_req;
};
 
+struct coll_oper_args {
+	struct shard_auxi_args	 coa_auxi;
+	int			 coa_dct_nr;
+	uint32_t		 coa_dct_cap;
+	uint32_t		 coa_max_dct_sz;
+	uint8_t			 coa_max_shard_nr;
+	uint8_t			 coa_max_bitmap_sz;
+	uint8_t			 coa_for_modify:1;
+	uint8_t			 coa_target_nr;
+	/*
+	 * The target IDs for the top four healthy shards.
+	 * Please check the comment for DTX_COLL_INLINE_TARGETS.
+	 */
+	uint32_t		 coa_targets[DTX_COLL_INLINE_TARGETS];
+	struct daos_coll_target	*coa_dcts;
+};
+
 struct shard_punch_args {
-	struct shard_auxi_args	 pa_auxi;
-	uuid_t			 pa_coh_uuid;
-	uuid_t			 pa_cont_uuid;
-	struct dtx_id		 pa_dti;
-	uint32_t		 pa_opc;
+	union {
+		struct shard_auxi_args	pa_auxi;
+		struct coll_oper_args	pa_coa;
+	};
+	struct dtx_id		 pa_dti;
+	uint32_t		 pa_opc;
 };
 
 struct shard_sub_anchor {
@@ -572,6 +617,13 @@ int dc_obj_shard_punch(struct dc_obj_shard *shard, enum obj_rpc_opc opc,
 		       void *shard_args, struct daos_shard_tgt *fw_shard_tgts,
 		       uint32_t fw_cnt, tse_task_t *task);
 
+int dc_obj_shard_coll_punch(struct dc_obj_shard *shard, struct shard_punch_args *args,
+			    struct dtx_memberships *mbs, uint32_t mbs_max_size, crt_bulk_t *bulks,
+			    uint32_t bulk_sz, struct daos_coll_target *tgts, uint32_t tgt_nr,
+			    uint32_t max_tgt_size, struct dtx_epoch *epoch, uint64_t api_flags,
+			    uint32_t rpc_flags, uint32_t map_ver, uint32_t *rep_ver,
+			    tse_task_t *task);
+
 int dc_obj_shard_list(struct dc_obj_shard *shard, enum obj_rpc_opc opc,
 		      void *shard_args, struct daos_shard_tgt *fw_shard_tgts,
 		      uint32_t fw_cnt, tse_task_t *task);
@@ -846,9 +898,32 @@ daos_recx_ep_list_ep_valid(struct daos_recx_ep_list *list)
 	return (list->re_ep_valid == 1);
 }
 
-int obj_class_init(void);
+int  obj_class_init(void);
 void obj_class_fini(void);
-int obj_utils_init(void);
+
+/*
+ * For efficiency, we do not want one leader (or relay) engine to forward too
+ * many collective requests to other engines. But we also need to guarantee
+ * that the payload size for each dispatch group is small enough to be packed
+ * in the RPC body, avoiding transfer via RDMA.
+ *
+ * On the other hand, a parent engine may need its children's feedback before
+ * replying to its own upper-level engine. So it is more efficient to let a
+ * parent engine forward more requests than each of its child engines does,
+ * because the collective task on the parent engine is scheduled earlier than
+ * on the child engines; otherwise, the parent engine may spend more time
+ * waiting.
+ */
+#define COLL_DISP_WIDTH_DEF	20
+#define COLL_DISP_WIDTH_MIN	8
+#define COLL_DISP_WIDTH_DIF	4
+
+/* obj_utils.c */
+void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size,
+			uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc);
+void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts,
+			crt_endpoint_t *tgt_ep);
+void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc);
+int  obj_utils_init(void);
 void obj_utils_fini(void);
 
 /* obj_tx.c */
diff --git a/src/object/obj_rpc.c b/src/object/obj_rpc.c
index e7f4e43960b..b42b71a84c4 100644
--- a/src/object/obj_rpc.c
+++ b/src/object/obj_rpc.c
@@ -500,12 +500,56 @@ crt_proc_struct_daos_shard_tgt(crt_proc_t proc, crt_proc_op_t proc_op,
 
 /* For compounded RPC. */
 
+static int
+crt_proc_struct_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op,
+			uint32_t mbs_max_size, struct dtx_memberships **p_mbs)
+{
+	struct dtx_memberships	*mbs = NULL;
+	uint32_t		 size = 0;
+	int			 rc;
+
+	if (FREEING(proc_op)) {
+		D_FREE(*p_mbs);
+		return 0;
+	}
+
+	if (ENCODING(proc_op)) {
+		mbs = *p_mbs;
+		size = sizeof(*mbs) + mbs->dm_data_size;
+	}
+
+	/* Pack the size of mbs to help the decode case. */
+	rc = crt_proc_uint32_t(proc, proc_op, &size);
+	if (unlikely(rc))
+		return rc;
+
+	D_ASSERT(size != 0);
+
+	if (DECODING(proc_op)) {
+		/* Allocate enough buffer to hold the delay-filled bitmap and targets information. */
+		D_ALLOC(mbs, size < mbs_max_size ?
mbs_max_size : size); + if (mbs == NULL) + return -DER_NOMEM; + } + + rc = crt_proc_memcpy(proc, proc_op, mbs, size); + if (unlikely(rc)) { + if (DECODING(proc_op)) + D_FREE(mbs); + return rc; + } + + if (DECODING(proc_op)) + *p_mbs = mbs; + + return 0; +} + static int crt_proc_struct_daos_cpd_sub_head(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_cpd_sub_head *dcsh, bool mbs) { - uint32_t size = 0; - int rc; + int rc; if (FREEING(proc_op)) { if (mbs) @@ -529,30 +573,7 @@ crt_proc_struct_daos_cpd_sub_head(crt_proc_t proc, crt_proc_op_t proc_op, if (!mbs) return 0; - if (ENCODING(proc_op)) - /* Pack the size of dcsh->dcsh_mbs to help decode case. */ - size = sizeof(*dcsh->dcsh_mbs) + dcsh->dcsh_mbs->dm_data_size; - - rc = crt_proc_uint32_t(proc, proc_op, &size); - if (unlikely(rc)) - return rc; - - D_ASSERT(size != 0); - - if (DECODING(proc_op)) { - D_ALLOC(dcsh->dcsh_mbs, size); - if (dcsh->dcsh_mbs == NULL) - return -DER_NOMEM; - } - - rc = crt_proc_memcpy(proc, proc_op, dcsh->dcsh_mbs, size); - if (unlikely(rc)) { - if (DECODING(proc_op)) - D_FREE(dcsh->dcsh_mbs); - return rc; - } - - return 0; + return crt_proc_struct_dtx_mbs(proc, proc_op, 0, &dcsh->dcsh_mbs); } static int @@ -848,11 +869,6 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, return rc; } - if (FREEING(proc_op)) { - D_FREE(dcb->dcb_bulk); - return 0; - } - rc = crt_proc_uint32_t(proc, proc_op, &dcb->dcb_size); if (unlikely(rc)) return rc; @@ -871,6 +887,9 @@ crt_proc_struct_daos_cpd_bulk(crt_proc_t proc, crt_proc_op_t proc_op, if (unlikely(rc)) return rc; + if (FREEING(proc_op)) + D_FREE(dcb->dcb_bulk); + /* The other fields will not be packed on-wire. */ return 0; @@ -1082,6 +1101,154 @@ crt_proc_struct_daos_req_comm_out(crt_proc_t proc, crt_proc_op_t proc_op, return 0; } +static int +crt_proc_struct_obj_dtx_mbs(crt_proc_t proc, crt_proc_op_t proc_op, + struct obj_dtx_mbs *odm) +{ + int rc; + + rc = crt_proc_struct_dtx_id(proc, proc_op, &odm->odm_xid); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &odm->odm_mbs_max_sz); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &odm->odm_padding); + if (unlikely(rc)) + return rc; + + return crt_proc_struct_dtx_mbs(proc, proc_op, odm->odm_mbs_max_sz, &odm->odm_mbs); +} + +static int +crt_proc_struct_daos_coll_shard(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_coll_shard *dcs) +{ + int rc = 0; + int i; + + if (FREEING(proc_op)) { + if (dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + return 0; + } + + rc = crt_proc_uint16_t(proc, proc_op, &dcs->dcs_nr); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint16_t(proc, proc_op, &dcs->dcs_cap); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint32_t(proc, proc_op, &dcs->dcs_inline); + if (unlikely(rc)) + return rc; + + if (DECODING(proc_op)) + dcs->dcs_cap = dcs->dcs_nr; + + if (dcs->dcs_nr <= 1) { + if (DECODING(proc_op)) + dcs->dcs_buf = &dcs->dcs_inline; + return 0; + } + + if (DECODING(proc_op)) { + D_ALLOC_ARRAY(dcs->dcs_buf, dcs->dcs_nr); + if (dcs->dcs_buf == NULL) + return -DER_NOMEM; + } + + for (i = 0; i < dcs->dcs_nr; i++) { + rc = crt_proc_uint32_t(proc, proc_op, &dcs->dcs_buf[i]); + if (unlikely(rc)) + goto out; + } + +out: + if (unlikely(rc) && DECODING(proc_op) && dcs->dcs_buf != &dcs->dcs_inline) + D_FREE(dcs->dcs_buf); + return rc; +} + +int +crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_coll_target *dct) +{ + int rc; + int i; + + rc = crt_proc_uint32_t(proc, proc_op, 
&dct->dct_rank); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_bitmap_sz); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_max_shard); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_tgt_nr); + if (unlikely(rc)) + return rc; + + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_tgt_cap); + if (unlikely(rc)) + return rc; + + if (DECODING(proc_op)) { + D_ALLOC(dct->dct_bitmap, dct->dct_bitmap_sz); + if (dct->dct_bitmap == NULL) + return -DER_NOMEM; + + /* When decode, allocate enough buffer to avoid some XS accessing invalid DRAM. */ + D_ALLOC_ARRAY(dct->dct_shards, dct->dct_bitmap_sz << 3); + if (dct->dct_shards == NULL) + goto out; + } + + rc = crt_proc_memcpy(proc, proc_op, dct->dct_bitmap, dct->dct_bitmap_sz); + if (unlikely(rc)) + goto out; + + for (i = 0; i <= dct->dct_max_shard; i++) { + rc = crt_proc_struct_daos_coll_shard(proc, proc_op, &dct->dct_shards[i]); + if (unlikely(rc)) + goto out; + } + + /* Skip empty dct_tgt_ids. */ + if (unlikely(dct->dct_tgt_cap == 0 || dct->dct_tgt_nr == 0)) + goto out; + + if (FREEING(proc_op)) + goto out; + + if (DECODING(proc_op)) { + D_ALLOC_ARRAY(dct->dct_tgt_ids, dct->dct_tgt_nr); + if (dct->dct_tgt_ids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + for (i = 0; i < dct->dct_tgt_nr; i++) { + rc = crt_proc_uint32_t(proc, proc_op, &dct->dct_tgt_ids[i]); + if (unlikely(rc)) + goto out; + } + +out: + if (FREEING(proc_op) || (unlikely(rc) && DECODING(proc_op))) { + D_FREE(dct->dct_bitmap); + D_FREE(dct->dct_shards); + D_FREE(dct->dct_tgt_ids); + } + + return rc; +} + CRT_RPC_DEFINE(obj_rw, DAOS_ISEQ_OBJ_RW, DAOS_OSEQ_OBJ_RW) CRT_RPC_DEFINE(obj_rw_v10, DAOS_ISEQ_OBJ_RW_V10, DAOS_OSEQ_OBJ_RW_V10) CRT_RPC_DEFINE(obj_key_enum, DAOS_ISEQ_OBJ_KEY_ENUM, DAOS_OSEQ_OBJ_KEY_ENUM) @@ -1098,6 +1265,7 @@ CRT_RPC_DEFINE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) CRT_RPC_DEFINE(obj_ec_rep, DAOS_ISEQ_OBJ_EC_REP, DAOS_OSEQ_OBJ_EC_REP) CRT_RPC_DEFINE(obj_key2anchor, DAOS_ISEQ_OBJ_KEY2ANCHOR, DAOS_OSEQ_OBJ_KEY2ANCHOR) CRT_RPC_DEFINE(obj_key2anchor_v10, DAOS_ISEQ_OBJ_KEY2ANCHOR_V10, DAOS_OSEQ_OBJ_KEY2ANCHOR_V10) +CRT_RPC_DEFINE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) /* Define for obj_proto_rpc_fmt[] array population below. 
* See OBJ_PROTO_*_RPC_LIST macro definition @@ -1179,6 +1347,9 @@ obj_reply_set_status(crt_rpc_t *rpc, int status) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_status = status; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_ret = status; + break; default: D_ASSERT(0); } @@ -1218,6 +1389,8 @@ obj_reply_get_status(crt_rpc_t *rpc) return ((struct obj_cpd_out *)reply)->oco_ret; case DAOS_OBJ_RPC_EC_REPLICATE: return ((struct obj_ec_rep_out *)reply)->er_status; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_ret; default: D_ASSERT(0); } @@ -1267,6 +1440,9 @@ obj_reply_map_version_set(crt_rpc_t *rpc, uint32_t map_version) case DAOS_OBJ_RPC_EC_REPLICATE: ((struct obj_ec_rep_out *)reply)->er_map_ver = map_version; break; + case DAOS_OBJ_RPC_COLL_PUNCH: + ((struct obj_coll_punch_out *)reply)->ocpo_map_version = map_version; + break; default: D_ASSERT(0); } @@ -1302,6 +1478,8 @@ obj_reply_map_version_get(crt_rpc_t *rpc) return ((struct obj_sync_out *)reply)->oso_map_version; case DAOS_OBJ_RPC_CPD: return ((struct obj_cpd_out *)reply)->oco_map_version; + case DAOS_OBJ_RPC_COLL_PUNCH: + return ((struct obj_coll_punch_out *)reply)->ocpo_map_version; default: D_ASSERT(0); } diff --git a/src/object/obj_rpc.h b/src/object/obj_rpc.h index dba1b31ca74..86a90fbe4d1 100644 --- a/src/object/obj_rpc.h +++ b/src/object/obj_rpc.h @@ -98,7 +98,10 @@ X(DAOS_OBJ_RPC_KEY2ANCHOR, \ 0, ver == 9 ? &CQF_obj_key2anchor : \ &CQF_obj_key2anchor_v10, \ - ds_obj_key2anchor_handler, NULL, "key2anchor") + ds_obj_key2anchor_handler, NULL, "key2anchor") \ + X(DAOS_OBJ_RPC_COLL_PUNCH, \ + 0, &CQF_obj_coll_punch, ds_obj_coll_punch_handler, \ + NULL, "obj_coll_punch") /* Define for RPC enum population below */ #define X(a, b, c, d, e, f) a, @@ -149,8 +152,8 @@ enum obj_rpc_flags { * oei_epr.epr_hi is epoch. */ ORF_ENUM_WITHOUT_EPR = (1 << 8), - /* CPD RPC leader */ - ORF_CPD_LEADER = (1 << 9), + /* RPC leader */ + ORF_LEADER = (1 << 9), /* Bulk data transfer for CPD RPC. */ ORF_CPD_BULK = (1 << 10), /* Contain EC split req, only used on CPD leader locally. Obsolete - DAOS-10348. 
*/ @@ -707,6 +710,42 @@ struct daos_cpd_sg { CRT_RPC_DECLARE(obj_cpd, DAOS_ISEQ_OBJ_CPD, DAOS_OSEQ_OBJ_CPD) +struct obj_dtx_mbs { + struct dtx_id odm_xid; + uint32_t odm_mbs_max_sz; + uint32_t odm_padding; + struct dtx_memberships *odm_mbs; +}; + +#define DAOS_ISEQ_OBJ_COLL_PUNCH /* input fields */ \ + ((struct obj_dtx_mbs) (ocpi_odm) CRT_VAR) \ + ((uuid_t) (ocpi_po_uuid) CRT_VAR) \ + ((uuid_t) (ocpi_co_hdl) CRT_VAR) \ + ((uuid_t) (ocpi_co_uuid) CRT_VAR) \ + ((daos_unit_oid_t) (ocpi_oid) CRT_RAW) \ + ((uint64_t) (ocpi_epoch) CRT_VAR) \ + ((uint64_t) (ocpi_api_flags) CRT_VAR) \ + ((uint32_t) (ocpi_map_ver) CRT_VAR) \ + ((uint32_t) (ocpi_flags) CRT_VAR) \ + ((uint32_t) (ocpi_bulk_tgt_sz) CRT_VAR) \ + ((uint32_t) (ocpi_bulk_tgt_nr) CRT_VAR) \ + ((crt_bulk_t) (ocpi_tgt_bulk) CRT_VAR) \ + ((uint32_t) (ocpi_max_tgt_sz) CRT_VAR) \ + ((uint16_t) (ocpi_disp_width) CRT_VAR) \ + ((uint16_t) (ocpi_disp_depth) CRT_VAR) \ + ((struct daos_coll_target) (ocpi_tgts) CRT_ARRAY) \ + ((struct daos_req_comm_in) (ocpi_comm_in) CRT_VAR) + +#define DAOS_OSEQ_OBJ_COLL_PUNCH /* output fields */ \ + ((int32_t) (ocpo_ret) CRT_VAR) \ + ((uint32_t) (ocpo_map_version) CRT_VAR) \ + ((struct daos_req_comm_out) (ocpo_comm_out) CRT_VAR) + +CRT_RPC_DECLARE(obj_coll_punch, DAOS_ISEQ_OBJ_COLL_PUNCH, DAOS_OSEQ_OBJ_COLL_PUNCH) + +#define ocpi_xid ocpi_odm.odm_xid +#define ocpi_mbs ocpi_odm.odm_mbs + static inline int obj_req_create(crt_context_t crt_ctx, crt_endpoint_t *tgt_ep, crt_opcode_t opc, crt_rpc_t **req) @@ -730,6 +769,8 @@ uint32_t obj_reply_map_version_get(crt_rpc_t *rpc); int crt_proc_struct_daos_cpd_sub_req(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_cpd_sub_req *dcsr, bool with_oid); +int crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, + struct daos_coll_target *dct); static inline bool obj_is_modification_opc(uint32_t opc) @@ -739,7 +780,7 @@ obj_is_modification_opc(uint32_t opc) opc == DAOS_OBJ_RPC_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || opc == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; + opc == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS || opc == DAOS_OBJ_RPC_COLL_PUNCH; } #define DAOS_OBJ_UPDATE_MODE_MASK (DAOS_OO_RW | DAOS_OO_EXCL | \ @@ -751,43 +792,6 @@ obj_is_fetch_opc(uint32_t opc) return opc == DAOS_OBJ_RPC_FETCH; } -static inline bool -obj_is_ec_agg_opc(uint32_t opc) -{ - return opc == DAOS_OBJ_RPC_EC_AGGREGATE || - opc == DAOS_OBJ_RPC_EC_REPLICATE; -} - -static inline bool -obj_rpc_is_update(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; -} - -static inline bool -obj_rpc_is_fetch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; -} - -static inline bool -obj_rpc_is_punch(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_PUNCH_AKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_DKEYS || - opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_PUNCH_AKEYS; -} - -static inline bool -obj_rpc_is_migrate(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_MIGRATE; -} - static inline bool obj_is_enum_opc(uint32_t opc) { @@ -798,40 +802,23 @@ obj_is_enum_opc(uint32_t opc) } static inline bool -obj_rpc_is_query(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_QUERY_KEY; -} - -static inline bool -obj_rpc_is_sync(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == 
DAOS_OBJ_RPC_SYNC; -} - -static inline bool -obj_rpc_is_key2anchor(crt_rpc_t *rpc) -{ - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_KEY2ANCHOR; -} - -static inline bool -obj_rpc_is_ec_agg(crt_rpc_t *rpc) +obj_is_ec_agg_opc(uint32_t opc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_AGGREGATE; - + return opc == DAOS_OBJ_RPC_EC_AGGREGATE || + opc == DAOS_OBJ_RPC_EC_REPLICATE; } static inline bool -obj_rpc_is_ec_rep(crt_rpc_t *rpc) +obj_rpc_is_update(crt_rpc_t *rpc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_EC_REPLICATE; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_UPDATE || + opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_TGT_UPDATE; } static inline bool -obj_rpc_is_cpd(crt_rpc_t *rpc) +obj_rpc_is_fetch(crt_rpc_t *rpc) { - return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD; + return opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_FETCH; } #endif /* __DAOS_OBJ_RPC_H__ */ diff --git a/src/object/obj_tx.c b/src/object/obj_tx.c index 6e56cce82e3..82ef196cd3b 100644 --- a/src/object/obj_tx.c +++ b/src/object/obj_tx.c @@ -2305,7 +2305,7 @@ dc_tx_commit_trigger(tse_task_t *task, struct dc_tx *tx, daos_tx_commit_t *args) uuid_copy(oci->oci_pool_uuid, tx->tx_pool->dp_pool); oci->oci_map_ver = tx->tx_pm_ver; - oci->oci_flags = ORF_CPD_LEADER; + oci->oci_flags = ORF_LEADER; if (tx->tx_set_resend && !tx->tx_renew) oci->oci_flags |= ORF_RESEND; tx->tx_renew = 0; @@ -2634,7 +2634,12 @@ dc_tx_restart(tse_task_t *task) /* * Reinitialize task with a delay to implement the * backoff and call dc_tx_restart_end below. + * + * We don't need to get an extra tx reference, because + * the reinitialized task must acquire tx->tx_lock + * first. */ + tse_task_set_priv_internal(task, tx); rc = tse_task_reinit_with_delay(task, backoff); if (rc != 0) { /* Skip the backoff. */ @@ -2643,8 +2648,6 @@ dc_tx_restart(tse_task_t *task) goto out_tx_lock; } D_MUTEX_UNLOCK(&tx->tx_lock); - /* Pass our tx reference to task. */ - tse_task_set_priv_internal(task, tx); return 0; } diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 8312c6719d8..45c773871f0 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -204,6 +204,120 @@ static btr_ops_t recx_btr_ops = { .to_key_decode = recx_key_decode }; +void +obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, + uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc) +{ + if (max_width == 0) { + /* + * Guarantee that the targets information (to be dispatched) can be packed + * inside the RPC body instead of via bulk transfer. + */ + max_width = (inline_size + max_tgt_size) / DAOS_BULK_LIMIT + 1; + if (max_width < COLL_DISP_WIDTH_DEF) + max_width = COLL_DISP_WIDTH_DEF; + } + + if (tgt_nr - start > max_width) { + ocdc->grp_nr = max_width; + ocdc->cur_step = (tgt_nr - start) / max_width; + if ((tgt_nr - start) % max_width != 0) { + ocdc->cur_step++; + ocdc->fixed_step = 0; + } else { + ocdc->fixed_step = 1; + } + } else { + ocdc->grp_nr = tgt_nr - start; + ocdc->cur_step = 1; + ocdc->fixed_step = 1; + } + + ocdc->pending_grps = ocdc->grp_nr; + ocdc->tgt_nr = tgt_nr; + ocdc->cur_pos = start; +} + +void +obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, + crt_endpoint_t *tgt_ep) +{ + struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; + struct daos_coll_target tmp; + unsigned long rand = 0; + uint32_t size; + int pos; + int i; + + if (ocdc->cur_step > 2) { + rand = d_rand(); + /* + * Randomly choose an engine as the relay one for load balance. 
+ * If the one corresponding to "pos" is the formerly moved one, then + * use the one at "cur_pos" as the relay engine. + */ + pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + memcpy(&tmp, &tgts[pos], sizeof(tmp)); + memcpy(&tgts[pos], dct, sizeof(tmp)); + memcpy(dct, &tmp, sizeof(tmp)); + } + } + + size = dct->dct_bitmap_sz << 3; + + /* Randomly choose an XS as the local leader on the target engine for load balance. */ + for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + if (isset(dct->dct_bitmap, i)) { + pos -= dct->dct_shards[i].dcs_nr; + if (pos < 0) + break; + } + } + + D_ASSERT(i < size); + + tgt_ep->ep_tag = i; + tgt_ep->ep_rank = dct->dct_rank; +} + +void +obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc) +{ + ocdc->cur_pos += ocdc->cur_step; + + /* The last one. */ + if (--(ocdc->pending_grps) == 0) { + D_ASSERTF(ocdc->cur_pos == ocdc->tgt_nr, + "COLL disp cursor trouble (1): " + "grp_nr %u, pos %u, step %u (%s), tgt_nr %u\n", + ocdc->grp_nr, ocdc->cur_pos, ocdc->cur_step, + ocdc->fixed_step ? "fixed" : "vary", ocdc->tgt_nr); + return; + } + + D_ASSERTF(ocdc->tgt_nr - ocdc->cur_pos >= ocdc->pending_grps, + "COLL disp cursor trouble (2): " + "pos %u, step %u (%s), tgt_nr %u, grp_nr %u, pending_grps %u\n", + ocdc->cur_pos, ocdc->cur_step, ocdc->fixed_step ? "fixed" : "vary", + ocdc->tgt_nr, ocdc->grp_nr, ocdc->pending_grps); + + if (ocdc->fixed_step) { + D_ASSERTF(ocdc->cur_pos + ocdc->cur_step <= ocdc->tgt_nr, + "COLL disp cursor trouble (3): " + "pos %u, step %u (%s), tgt_nr %u, grp_nr %u, pending_grps %u\n", + ocdc->cur_pos, ocdc->cur_step, ocdc->fixed_step ? "fixed" : "vary", + ocdc->tgt_nr, ocdc->grp_nr, ocdc->pending_grps); + return; + } + + ocdc->cur_step = (ocdc->tgt_nr - ocdc->cur_pos) / ocdc->pending_grps; + if ((ocdc->tgt_nr - ocdc->cur_pos) % ocdc->pending_grps != 0) + ocdc->cur_step++; + else + ocdc->fixed_step = 1; +} + int obj_utils_init(void) { diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 5a771e8018f..71aedeca895 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -89,6 +89,7 @@ struct ec_agg_par_extent { struct ec_agg_stripe { daos_off_t as_stripenum; /* ordinal of stripe, offset/(k*len) */ daos_epoch_t as_hi_epoch; /* highest epoch in stripe */ + daos_epoch_t as_lo_epoch; /* lowest epoch in stripe */ d_list_t as_dextents; /* list of stripe's data extents */ daos_off_t as_stripe_fill; /* amount of stripe covered by data */ uint64_t as_offset; /* start offset in stripe */ @@ -114,6 +115,7 @@ struct ec_agg_entry { struct pl_obj_layout *ae_obj_layout; struct daos_shard_loc ae_peer_pshards[OBJ_EC_MAX_P]; uint32_t ae_grp_idx; + uint32_t ae_is_leader:1; }; /* Parameters used to drive iterate all. 
@@ -123,13 +125,13 @@ struct ec_agg_param { struct ec_agg_entry ap_agg_entry; /* entry used for each OID */ daos_epoch_range_t ap_epr; /* hi/lo extent threshold */ daos_epoch_t ap_filter_eph; /* Aggregatable filter epoch */ + daos_epoch_t ap_min_unagg_eph; /* minimum unaggregated epoch */ daos_handle_t ap_cont_handle; /* VOS container handle */ int (*ap_yield_func)(void *arg); /* yield function*/ void *ap_yield_arg; /* yield argument */ uint32_t ap_credits_max; /* # of tight loops to yield */ uint32_t ap_credits; /* # of tight loops */ - uint32_t ap_initialized:1, /* initialized flag */ - ap_obj_skipped:1; /* skipped obj during aggregation */ + uint32_t ap_initialized:1; /* initialized flag */ }; /* Struct used to drive offloaded stripe update. @@ -324,6 +326,7 @@ agg_clear_extents(struct ec_agg_entry *entry) D_ASSERT(entry->ae_cur_stripe.as_extent_cnt == 0); } entry->ae_cur_stripe.as_hi_epoch = 0UL; + entry->ae_cur_stripe.as_lo_epoch = 0UL; entry->ae_cur_stripe.as_stripe_fill = 0; entry->ae_cur_stripe.as_has_holes = carry_is_hole ? true : false; } @@ -1858,7 +1861,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) * and all replica extents are newer than parity. */ if (ec_age_stripe_full(entry, ec_age_with_parity(entry))) { - rc = agg_encode_local_parity(entry); + if (entry->ae_is_leader) { + rc = agg_encode_local_parity(entry); + } else { + update_vos = false; + agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph, + entry->ae_cur_stripe.as_lo_epoch); + } goto out; } @@ -1868,6 +1877,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) goto out; } + if (!entry->ae_is_leader) { + update_vos = false; + agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph, + entry->ae_cur_stripe.as_lo_epoch); + goto out; + } + /* With parity and some newer partial replicas, possibly holes */ if (ec_age_with_hole(entry)) process_holes = true; @@ -1951,13 +1967,19 @@ agg_extent_add(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry, agg_in_stripe(agg_entry, recx); } + if (agg_entry->ae_cur_stripe.as_lo_epoch == 0 || + extent->ae_epoch < agg_entry->ae_cur_stripe.as_lo_epoch) + agg_entry->ae_cur_stripe.as_lo_epoch = extent->ae_epoch; + if (extent->ae_epoch > agg_entry->ae_cur_stripe.as_hi_epoch) agg_entry->ae_cur_stripe.as_hi_epoch = extent->ae_epoch; - D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u\n", + D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u, " + "max/min "DF_X64"/"DF_X64"\n", DP_RECX(extent->ae_recx), agg_stripenum(agg_entry, extent->ae_recx.rx_idx), - agg_entry->ae_oid.id_shard); + agg_entry->ae_oid.id_shard, agg_entry->ae_cur_stripe.as_hi_epoch, + agg_entry->ae_cur_stripe.as_lo_epoch); out: return rc; } @@ -1973,9 +1995,9 @@ agg_data_extent(struct ec_agg_param *agg_param, vos_iter_entry_t *entry, D_ASSERT(!(entry->ie_recx.rx_idx & PARITY_INDICATOR)); - D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", %u\n", + D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", "DF_X64"/%u leader %s\n", DP_UOID(agg_entry->ae_oid), DP_RECX(entry->ie_recx), - entry->ie_minor_epc); + entry->ie_epoch, entry->ie_minor_epc, agg_entry->ae_is_leader ? 
"yes" : "no"); while (offset < end) { daos_off_t this_stripenum; @@ -2038,6 +2060,7 @@ agg_akey_post(daos_handle_t ih, struct ec_agg_param *agg_param, agg_entry->ae_cur_stripe.as_stripenum = 0UL; agg_entry->ae_cur_stripe.as_hi_epoch = 0UL; + agg_entry->ae_cur_stripe.as_lo_epoch = 0UL; agg_entry->ae_cur_stripe.as_stripe_fill = 0UL; agg_entry->ae_cur_stripe.as_offset = 0U; } @@ -2073,39 +2096,57 @@ agg_reset_pos(vos_iter_type_t type, struct ec_agg_entry *agg_entry) } } -static int -agg_shard_is_leader(struct ds_pool *pool, struct ec_agg_entry *agg_entry) +static bool +agg_shard_is_parity(struct ds_pool *pool, struct ec_agg_entry *agg_entry) { - struct pl_obj_shard *shard; struct daos_oclass_attr *oca; uint32_t grp_idx; uint32_t grp_start; - uint32_t ec_tgt_idx; - int shard_idx; - int rc; + uint32_t min_fseq = -1; + int leader_shard = -1; + int i; oca = &agg_entry->ae_oca; + if (is_ec_data_shard_by_layout_ver(agg_entry->ae_oid.id_layout_ver, + agg_entry->ae_dkey_hash, oca, + agg_entry->ae_oid.id_shard)) { + agg_entry->ae_is_leader = 0; + return false; + } + grp_idx = agg_entry->ae_oid.id_shard / daos_oclass_grp_size(oca); - grp_start = grp_idx * daos_oclass_grp_size(oca); - ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver, - agg_entry->ae_dkey_hash, oca, - daos_oclass_grp_size(oca) - 1); - /** - * FIXME: only the last parity shard can be the EC agg leader. What about - * Degraded mode? - */ - if (agg_entry->ae_oid.id_shard != ec_tgt_idx + grp_start) - return 0; + grp_start = grp_idx * agg_entry->ae_obj_layout->ol_grp_size; + for (i = 0; i < obj_ec_parity_tgt_nr(oca); i++) { + uint32_t ec_tgt_idx; + uint32_t shard_idx; + struct pl_obj_shard *shard; + + ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver, + agg_entry->ae_dkey_hash, oca, + daos_oclass_grp_size(oca) - i - 1); + + shard_idx = grp_start + ec_tgt_idx; + shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx); - /* If last parity unavailable, then skip the object via returning -DER_STALE. */ - shard_idx = grp_idx * agg_entry->ae_obj_layout->ol_grp_size + ec_tgt_idx; - shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx); - if (shard->po_target != -1 && shard->po_shard != -1 && !shard->po_rebuilding) - rc = (agg_entry->ae_oid.id_shard == shard->po_shard) ? 1 : 0; + if (shard->po_target == -1 || shard->po_shard == -1 || shard->po_rebuilding) + continue; + + if (min_fseq == -1 || min_fseq > shard->po_fseq) { + leader_shard = shard_idx; + min_fseq = shard->po_fseq; + } + } + + /* No parity shard is available */ + if (leader_shard == -1) + return false; + + if (agg_entry->ae_oid.id_shard == leader_shard) + agg_entry->ae_is_leader = 1; else - rc = -DER_STALE; + agg_entry->ae_is_leader = 0; - return rc; + return true; } /* Initializes the struct holding the iteration state (ec_agg_entry). 
*/ @@ -2129,8 +2170,6 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry, struct ec_agg_param *agg_param, struct ec_agg_entry *agg_entry, unsigned int *acts) { - int rc; - if (!agg_key_compare(agg_entry->ae_dkey, entry->ie_key)) { D_DEBUG(DB_EPC, "Skip dkey: "DF_KEY" ec agg on re-probe\n", DP_KEY(&entry->ie_key)); @@ -2144,24 +2183,16 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry, agg_entry->ae_dkey_hash = obj_dkey2hash(agg_entry->ae_oid.id_pub, &agg_entry->ae_dkey); agg_reset_pos(VOS_ITER_AKEY, agg_entry); - rc = agg_shard_is_leader(agg_param->ap_pool_info.api_pool, agg_entry); - if (rc == 1) { - D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting\n", - DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey)); + if (agg_shard_is_parity(agg_param->ap_pool_info.api_pool, agg_entry)) { + D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting leader %s\n", + DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey), + agg_entry->ae_is_leader ? "yes" : "no"); agg_reset_dkey_entry(&agg_param->ap_agg_entry, entry); - rc = 0; } else { - if (rc < 0) { - D_ERROR("oid:"DF_UOID" ds_pool_check_leader failed " - DF_RC"\n", DP_UOID(entry->ie_oid), DP_RC(rc)); - if (rc == -DER_STALE) - agg_param->ap_obj_skipped = 1; - rc = 0; - } *acts |= VOS_ITER_CB_SKIP; } - return rc; + return 0; } /* Handles akeys returned by the iterator. */ @@ -2625,7 +2656,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); - ec_agg_param->ap_obj_skipped = 0; + ec_agg_param->ap_min_unagg_eph = DAOS_EPOCH_MAX; rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, agg_iterate_post_cb, ec_agg_param, NULL); @@ -2637,8 +2668,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, ec_agg_param->ap_agg_entry.ae_obj_hdl = DAOS_HDL_INVAL; } - if (ec_agg_param->ap_obj_skipped && !cont->sc_stopping) { - D_DEBUG(DB_EPC, "with skipped obj during aggregation.\n"); + if (cont->sc_pool->spc_pool->sp_rebuilding > 0 && !cont->sc_stopping) { /* There is rebuild going on, and we can't proceed EC aggregate boundary, * Let's wait for 5 seconds for another EC aggregation. */ @@ -2649,10 +2679,22 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, vos_aggregate_exit(cont->sc_hdl); update_hae: - if (rc == 0 && ec_agg_param->ap_obj_skipped == 0) { + if (rc == 0) { cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi); - if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) - *cont->sc_ec_query_agg_eph = cont->sc_ec_agg_eph; + if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) { + uint64_t orig, cur; + + orig = d_hlc2sec(*cont->sc_ec_query_agg_eph); + cur = d_hlc2sec(cont->sc_ec_agg_eph); + if (orig && cur > orig && (cur - orig) >= 600) + D_WARN(DF_CONT" Sluggish EC boundary bumping: " + DF_U64" -> "DF_U64", gap:"DF_U64"\n", + DP_CONT(cont->sc_pool_uuid, cont->sc_uuid), + orig, cur, cur - orig); + + *cont->sc_ec_query_agg_eph = min(ec_agg_param->ap_min_unagg_eph, + cont->sc_ec_agg_eph); + } } return rc; diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 4452e040486..4bb9b086fb7 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -236,11 +236,14 @@ obj_update_latency(uint32_t opc, uint32_t type, uint64_t latency, uint64_t io_si } struct ds_obj_exec_arg { - crt_rpc_t *rpc; - struct obj_io_context *ioc; - void *args; - uint32_t flags; - uint32_t start; /* The start shard for EC obj. 
*/ + crt_rpc_t *rpc; + struct obj_io_context *ioc; + void *args; + uint32_t flags; + uint32_t start; /* The start shard for EC obj. */ + struct daos_coll_shard *coll_shards; + struct daos_coll_target *coll_tgts; + struct obj_coll_disp_cursor coll_cur; }; int @@ -252,6 +255,9 @@ ds_obj_remote_punch(struct dtx_leader_handle *dth, void *arg, int idx, int ds_obj_cpd_dispatch(struct dtx_leader_handle *dth, void *arg, int idx, dtx_sub_comp_cb_t comp_cb); +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dth, void *arg, int idx, + dtx_sub_comp_cb_t comp_cb); /* srv_obj.c */ void ds_obj_rw_handler(crt_rpc_t *rpc); @@ -266,6 +272,7 @@ void ds_obj_migrate_handler(crt_rpc_t *rpc); void ds_obj_ec_agg_handler(crt_rpc_t *rpc); void ds_obj_ec_rep_handler(crt_rpc_t *rpc); void ds_obj_cpd_handler(crt_rpc_t *rpc); +void ds_obj_coll_punch_handler(crt_rpc_t *rpc); typedef int (*ds_iofw_cb_t)(crt_rpc_t *req, void *arg); struct daos_cpd_args { diff --git a/src/object/srv_mod.c b/src/object/srv_mod.c index 72a25ba97de..94099dc3f02 100644 --- a/src/object/srv_mod.c +++ b/src/object/srv_mod.c @@ -213,7 +213,9 @@ struct dss_module_key obj_module_key = { static int obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = 0; D_ASSERT(proto_ver == DAOS_OBJ_VERSION || proto_ver == DAOS_OBJ_VERSION - 1); @@ -226,7 +228,11 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) /* Extract hint from RPC */ attr->sra_enqueue_id = 0; - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_in *orw = crt_req_get(rpc); if (proto_ver >= 10) { @@ -237,12 +243,19 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) sched_req_attr_init(attr, obj_rpc_is_update(rpc) ? 
SCHED_REQ_UPDATE : SCHED_REQ_FETCH, &orw->orw_pool_uuid); - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_in *omi = crt_req_get(rpc); attr->sra_enqueue_id = omi->om_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &omi->om_pool_uuid); - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_in *oei = crt_req_get(rpc); if (proto_ver >= 10) { @@ -251,7 +264,14 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oei_v10->oei_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oei->oei_pool_uuid); - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_in *opi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -260,7 +280,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = opi_v10->opi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &opi->opi_pool_uuid); - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_in *okqi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -269,7 +291,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = okqi_v10->okqi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &okqi->okqi_pool_uuid); - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_in *osi = crt_req_get(rpc); if (proto_ver >= 10) { @@ -278,7 +302,9 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = osi_v10->osi_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_UPDATE, &osi->osi_pool_uuid); - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_in *oki = crt_req_get(rpc); if (proto_ver >= 10) { @@ -287,102 +313,146 @@ obj_get_req_attr(crt_rpc_t *rpc, struct sched_req_attr *attr) attr->sra_enqueue_id = oki_v10->oki_comm_in.req_in_enqueue_id; } sched_req_attr_init(attr, SCHED_REQ_FETCH, &oki->oki_pool_uuid); - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_in *ea = crt_req_get(rpc); attr->sra_enqueue_id = ea->ea_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &ea->ea_pool_uuid); - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_in *er = crt_req_get(rpc); attr->sra_enqueue_id = er->er_comm_in.req_in_enqueue_id; sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &er->er_pool_uuid); - } else if (obj_rpc_is_cpd(rpc)) { + break; + } + case DAOS_OBJ_RPC_CPD: { struct obj_cpd_in *oci = crt_req_get(rpc); - sched_req_attr_init(attr, SCHED_REQ_MIGRATE, &oci->oci_pool_uuid); - } else { + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &oci->oci_pool_uuid); + break; + } + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + + attr->sra_enqueue_id = ocpi->ocpi_comm_in.req_in_enqueue_id; + sched_req_attr_init(attr, SCHED_REQ_UPDATE, &ocpi->ocpi_po_uuid); + break; + } + default: /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_NOSYS; + 
rc = -DER_NOSYS; + break; } - return 0; + return rc; } static int obj_set_req(crt_rpc_t *rpc, struct sched_req_attr *attr) { - int proto_ver = crt_req_get_proto_ver(rpc); + int opc = opc_get(rpc->cr_opc); + int proto_ver = crt_req_get_proto_ver(rpc); + int rc = -DER_OVERLOAD_RETRY; /* Old protocol RPCs won't be rejected. */ D_ASSERT(proto_ver == DAOS_OBJ_VERSION); - if (obj_rpc_is_update(rpc) || obj_rpc_is_fetch(rpc)) { + + switch (opc) { + case DAOS_OBJ_RPC_UPDATE: + case DAOS_OBJ_RPC_TGT_UPDATE: + case DAOS_OBJ_RPC_FETCH: { struct obj_rw_v10_out *orwo_v10 = crt_reply_get(rpc); orwo_v10->orw_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; orwo_v10->orw_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_migrate(rpc)) { + break; + } + case DAOS_OBJ_RPC_MIGRATE: { struct obj_migrate_out *om = crt_reply_get(rpc); om->om_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; om->om_status = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_is_enum_opc(rpc->cr_opc)) { + break; + } + case DAOS_OBJ_DKEY_RPC_ENUMERATE: + case DAOS_OBJ_RPC_ENUMERATE: + case DAOS_OBJ_AKEY_RPC_ENUMERATE: + case DAOS_OBJ_RECX_RPC_ENUMERATE: { struct obj_key_enum_v10_out *oeo_v10 = crt_reply_get(rpc); oeo_v10->oeo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oeo_v10->oeo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_punch(rpc)) { + break; + } + case DAOS_OBJ_RPC_PUNCH: + case DAOS_OBJ_RPC_PUNCH_DKEYS: + case DAOS_OBJ_RPC_PUNCH_AKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_TGT_PUNCH_DKEYS: + case DAOS_OBJ_RPC_TGT_PUNCH_AKEYS: { struct obj_punch_v10_out *opo_v10 = crt_reply_get(rpc); opo_v10->opo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; opo_v10->opo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_query(rpc)) { + break; + } + case DAOS_OBJ_RPC_QUERY_KEY: { struct obj_query_key_v10_out *okqo_v10 = crt_reply_get(rpc); okqo_v10->okqo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; okqo_v10->okqo_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_sync(rpc)) { + break; + } + case DAOS_OBJ_RPC_SYNC: { struct obj_sync_v10_out *oso_v10 = crt_reply_get(rpc); oso_v10->oso_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oso_v10->oso_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_key2anchor(rpc)) { + break; + } + case DAOS_OBJ_RPC_KEY2ANCHOR: { struct obj_key2anchor_v10_out *oko_v10 = crt_reply_get(rpc); oko_v10->oko_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; oko_v10->oko_ret = -DER_OVERLOAD_RETRY; - - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_agg(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_AGGREGATE: { struct obj_ec_agg_out *ea_out = crt_reply_get(rpc); ea_out->ea_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; ea_out->ea_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_ec_rep(rpc)) { + break; + } + case DAOS_OBJ_RPC_EC_REPLICATE: { struct obj_ec_rep_out *er_out = crt_reply_get(rpc); er_out->er_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; er_out->er_status = -DER_OVERLOAD_RETRY; - return -DER_OVERLOAD_RETRY; - } else if (obj_rpc_is_cpd(rpc)) { - /* No RPC retry for DTX, client will retry anyway. */ - return -DER_TIMEDOUT; + break; + } + case DAOS_OBJ_RPC_CPD: + /* NOTE: It needs to be enhanced. Currently, just let client retry anyway. 
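+ * Unlike the cases above, no enqueue id is propagated back for CPD; the + * client-side DTX logic will resend the transaction upon timeout.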
*/ + rc = -DER_TIMEDOUT; + break; + case DAOS_OBJ_RPC_COLL_PUNCH: { + struct obj_coll_punch_out *ocpo = crt_reply_get(rpc); + + ocpo->ocpo_comm_out.req_out_enqueue_id = attr->sra_enqueue_id; + ocpo->ocpo_ret = -DER_OVERLOAD_RETRY; + break; } - /* Other requests will not be queued, see dss_rpc_hdlr() */ - return -DER_TIMEDOUT; + default: + /* Other requests will not be queued, see dss_rpc_hdlr() */ + rc = -DER_TIMEDOUT; + break; + } + + return rc; } static struct dss_module_ops ds_obj_mod_ops = { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index ceca7728b24..9cd0473effd 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -25,7 +25,7 @@ #include #include #include -#include "daos_srv/srv_csum.h" +#include #include "obj_rpc.h" #include "srv_internal.h" @@ -2596,8 +2596,6 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; } /* Inject failure for test to simulate the case of lost some @@ -2787,6 +2785,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) int dti_cos_cnt; uint32_t tgt_cnt; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; bool need_abort = false; @@ -2857,6 +2856,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) } version = orw->orw_map_ver; + max_ver = orw->orw_map_ver; if (tgt_cnt == 0) { if (!(orw->orw_api_flags & DAOS_COND_MASK)) @@ -2873,7 +2873,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; d_tm_inc_counter(opm->opm_update_resent, 1); again1: @@ -2936,7 +2935,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) rc = dtx_leader_begin(ioc.ioc_vos_coh, &orw->orw_dti, &epoch, 1, version, &orw->orw_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for update " DF_RC "\n", DP_UOID(orw->orw_oid), DP_RC(rc)); @@ -2951,6 +2950,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) /* Execute the operation on all targets */ rc = dtx_leader_exec_ops(dlh, obj_tgt_update, NULL, 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distributed transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3004,6 +3006,9 @@ ds_obj_rw_handler(crt_rpc_t *rpc) DP_DTI(&orw->orw_dti), DP_RC(rc1)); } + if (ioc.ioc_map_ver < max_ver) + ioc.ioc_map_ver = max_ver; + obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); D_FREE(mbs); D_FREE(dti_cos); @@ -3453,6 +3458,7 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, switch (opc) { case DAOS_OBJ_RPC_PUNCH: case DAOS_OBJ_RPC_TGT_PUNCH: + case DAOS_OBJ_RPC_COLL_PUNCH: rc = vos_obj_punch(cont->sc_hdl, opi->opi_oid, opi->opi_epoch, opi->opi_map_ver, 0, NULL, 0, NULL, dth); @@ -3542,59 +3548,57 @@ obj_local_punch(struct obj_punch_in *opi, crt_opcode_t opc, return rc; } -/* Handle the punch requests on non-leader */ -void -ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +struct obj_tgt_punch_args { + uint32_t opc; + struct obj_io_context *sponsor_ioc; + struct dtx_handle *sponsor_dth; + struct obj_punch_in *opi; + struct dtx_memberships *mbs; + uint32_t *ver; + void *data; +}; + +static int +obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count) { - struct dtx_handle *dth = NULL; - struct obj_io_context ioc; - struct obj_punch_in *opi; - struct dtx_memberships *mbs = NULL; - struct daos_shard_tgt *tgts = NULL; - uint32_t dtx_flags = 0; - uint32_t tgt_cnt; - struct dtx_epoch epoch; - int rc; + struct 
obj_io_context ioc = { 0 }; + struct obj_io_context *p_ioc = otpa->sponsor_ioc; + struct dtx_handle *dth = otpa->sponsor_dth; + struct obj_punch_in *opi = otpa->opi; + struct dtx_epoch epoch; + daos_epoch_t tmp; + uint32_t dtx_flags = 0; + int rc = 0; + int i; - opi = crt_req_get(rpc); - D_ASSERT(opi != NULL); - rc = obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, - opi->opi_pool_uuid, opi->opi_co_hdl, - opi->opi_co_uuid, rpc, opi->opi_flags, &ioc); - if (rc) - goto out; + if (p_ioc == NULL) { + p_ioc = &ioc; + rc = obj_ioc_begin(opi->opi_oid.id_pub, opi->opi_map_ver, opi->opi_pool_uuid, + opi->opi_co_hdl, opi->opi_co_uuid, otpa->data, opi->opi_flags, + p_ioc); + if (rc != 0) + goto out; + } - /* Handle resend. */ - if (opi->opi_flags & ORF_RESEND) { - daos_epoch_t e = opi->opi_epoch; + if (dth != NULL) + goto exec; - rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, NULL); + if (opi->opi_flags & ORF_RESEND) { + tmp = opi->opi_epoch; + rc = dtx_handle_resend(p_ioc->ioc_vos_coh, &opi->opi_dti, &tmp, NULL); /* Do nothing if 'prepared' or 'committed'. */ if (rc == -DER_ALREADY || rc == 0) D_GOTO(out, rc = 0); - /* Abort it firstly if exist but with different epoch, - * then re-execute with new epoch. - */ + /* Abort old one with different epoch, then re-execute with new epoch. */ if (rc == -DER_MISMATCH) /* Abort it by force with MAX epoch to guarantee * that it can be aborted. */ - rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + rc = vos_dtx_abort(p_ioc->ioc_vos_coh, &opi->opi_dti, tmp); if (rc < 0 && rc != -DER_NONEXIST) D_GOTO(out, rc); - - dtx_flags |= DTX_RESEND; - } - - tgts = opi->opi_shard_tgts.ca_arrays; - tgt_cnt = opi->opi_shard_tgts.ca_count; - - if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { - rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &mbs); - if (rc != 0) - D_GOTO(out, rc); } epoch.oe_value = opi->opi_epoch; @@ -3605,10 +3609,9 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) dtx_flags |= DTX_SYNC; /* Start the local transaction */ - rc = dtx_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, - opi->opi_map_ver, &opi->opi_oid, - opi->opi_dti_cos.ca_arrays, - opi->opi_dti_cos.ca_count, dtx_flags, mbs, &dth); + rc = dtx_begin(p_ioc->ioc_vos_coh, &opi->opi_dti, &epoch, count, opi->opi_map_ver, + &opi->opi_oid, opi->opi_dti_cos.ca_arrays, opi->opi_dti_cos.ca_count, + dtx_flags, otpa->mbs, &dth); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3618,19 +3621,59 @@ ds_obj_tgt_punch_handler(crt_rpc_t *rpc) if (DAOS_FAIL_CHECK(DAOS_DTX_NONLEADER_ERROR)) D_GOTO(out, rc = -DER_IO); - rc = obj_local_punch(opi, opc_get(rpc->cr_opc), &ioc, dth); - if (rc != 0) - DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); +exec: + /* There may be multiple shards residing on the same VOS target. 
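+ * For example, a collective punch hands this function every shard that one + * VOS target serves, and all of them are punched under the same DTX handle.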
*/ + for (i = 0; i < count; i++) { + opi->opi_oid.id_shard = shards[i]; + rc = obj_local_punch(opi, otpa->opc, p_ioc, dth); + if (rc != 0) { + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + goto out; + } + } out: - /* Stop the local transaction */ - if (dth != NULL) - rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); - D_FREE(mbs); - obj_ioc_end(&ioc, rc); + if (otpa->ver != NULL) + *otpa->ver = p_ioc->ioc_map_ver; + + if (dth != NULL && dth != otpa->sponsor_dth) + rc = dtx_end(dth, p_ioc->ioc_coc, rc); + + if (p_ioc == &ioc) + obj_ioc_end(p_ioc, rc); + + return rc; +} + +/* Handle the punch requests on non-leader */ +void +ds_obj_tgt_punch_handler(crt_rpc_t *rpc) +{ + struct obj_tgt_punch_args otpa = { 0 }; + struct obj_punch_in *opi = crt_req_get(rpc); + struct daos_shard_tgt *tgts = opi->opi_shard_tgts.ca_arrays; + uint32_t tgt_cnt = opi->opi_shard_tgts.ca_count; + uint32_t version = 0; + int rc; + + if (!daos_is_zero_dti(&opi->opi_dti) && tgt_cnt != 0) { + rc = obj_gen_dtx_mbs(opi->opi_flags, &tgt_cnt, &tgts, &otpa.mbs); + if (rc != 0) + D_GOTO(out, rc); + } + + otpa.opc = opc_get(rpc->cr_opc); + otpa.opi = opi; + otpa.ver = &version; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, &opi->opi_oid.id_shard, 1); + +out: + obj_punch_complete(rpc, rc, version); + D_FREE(otpa.mbs); } static int @@ -3654,13 +3697,18 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) for (i = 0; i < sub_cnt; i++) { sub = &dlh->dlh_subs[i]; if (sub->dss_tgt.st_rank != DAOS_TGT_IGNORE && sub->dss_comp) { - if (sub->dss_result == 0) + if (sub->dss_result == 0) { succeeds++; - else if (sub->dss_result == allow_failure) + } else if (sub->dss_result == allow_failure) { allow_failure_cnt++; - else if (result == -DER_INPROGRESS || result == 0) - /* Ignore INPROGRESS if there is other failure. */ + } else if (result == -DER_INPROGRESS || result == -DER_AGAIN || + result == 0) { + /* Ignore INPROGRESS and AGAIN if there is other failure. 
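+ * Such retryable results must not hide a hard failure from another shard; + * the remote pool map version is collected at the same time so that the + * leader can report the highest version it has seen.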
*/ result = sub->dss_result; + + if (dlh->dlh_rmt_ver < sub->dss_version) + dlh->dlh_rmt_ver = sub->dss_version; + } } } @@ -3675,8 +3723,7 @@ obj_punch_agg_cb(struct dtx_leader_handle *dlh, int allow_failure) } static int -obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, - dtx_sub_comp_cb_t comp_cb) +obj_tgt_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) { struct ds_obj_exec_arg *exec_arg = arg; @@ -3694,10 +3741,9 @@ obj_tgt_punch(struct dtx_leader_handle *dlh, void *arg, int idx, rc = obj_local_punch(opi, opc_get(rpc->cr_opc), exec_arg->ioc, &dlh->dlh_handle); if (rc != 0) - DL_CDEBUG( - rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || - (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), - DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); + DL_CDEBUG(rc == -DER_INPROGRESS || rc == -DER_TX_RESTART || + (rc == -DER_NONEXIST && (opi->opi_api_flags & DAOS_COND_PUNCH)), + DB_IO, DLOG_ERR, rc, DF_UOID, DP_UOID(opi->opi_oid)); comp: if (comp_cb != NULL) @@ -3726,6 +3772,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t flags = 0; uint32_t dtx_flags = 0; uint32_t version = 0; + uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; bool need_abort = false; @@ -3765,6 +3812,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; version = opi->opi_map_ver; + max_ver = opi->opi_map_ver; tgts = opi->opi_shard_tgts.ca_arrays; tgt_cnt = opi->opi_shard_tgts.ca_count; @@ -3786,8 +3834,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; - dtx_flags |= DTX_RESEND; - again1: e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, @@ -3848,7 +3894,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_begin(ioc.ioc_vos_coh, &opi->opi_dti, &epoch, 1, version, &opi->opi_oid, dti_cos, dti_cos_cnt, - tgts, tgt_cnt, dtx_flags, mbs, &dlh); + tgts, tgt_cnt, dtx_flags, mbs, NULL /* dce */, &dlh); if (rc != 0) { D_ERROR(DF_UOID ": Failed to start DTX for punch " DF_RC "\n", DP_UOID(opi->opi_oid), DP_RC(rc)); @@ -3860,10 +3906,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.flags = flags; /* Execute the operation on all shards */ - rc = dtx_leader_exec_ops(dlh, obj_tgt_punch, obj_punch_agg_cb, + rc = dtx_leader_exec_ops(dlh, obj_tgt_punch_disp, obj_punch_agg_cb, (opi->opi_api_flags & DAOS_COND_PUNCH) ? -DER_NONEXIST : 0, &exec_arg); + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + /* Stop the distribute transaction */ rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { @@ -3904,7 +3953,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) DP_DTI(&opi->opi_dti), DP_RC(rc1)); } - obj_punch_complete(rpc, rc, ioc.ioc_map_ver); + obj_punch_complete(rpc, rc, max_ver); cleanup: D_FREE(mbs); @@ -4649,8 +4698,6 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) /* Do nothing if 'prepared' or 'committed'. */ if (rc1 == -DER_ALREADY || rc1 == 0) D_GOTO(out, rc = 0); - - dtx_flags |= DTX_RESEND; } /* Refuse any modification with old epoch. */ @@ -4697,9 +4744,9 @@ ds_obj_dtx_follower(crt_rpc_t *rpc, struct obj_io_context *ioc) rc = ds_cpd_handle_one_wrap(rpc, dcsh, dcde, dcsr, ioc, dth); /* For the case of only containing read sub operations, we will - * generate DTX entry for DTX recovery. Similarly for noop case. + * generate DTX entry for DTX recovery. 
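+ * (A pure-read transaction leaves nothing modified locally, hence the + * explicit vos_dtx_attach() below.)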
*/ - if (rc == 0 && (dth->dth_modification_cnt == 0 || !dth->dth_active)) + if (rc == 0 && dth->dth_modification_cnt == 0) rc = vos_dtx_attach(dth, true, false); rc = dtx_end(dth, ioc->ioc_coc, rc); @@ -4822,8 +4869,6 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) D_ASSERT(dcsh->dcsh_epoch.oe_value != DAOS_EPOCH_MAX); if (oci->oci_flags & ORF_RESEND) { - dtx_flags |= DTX_RESEND; - again: /* For distributed transaction, the 'ORF_RESEND' may means * that the DTX has been restarted with newer epoch. @@ -4900,11 +4945,10 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) else dtx_flags &= ~DTX_PREPARED; - rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, - &dcsh->dcsh_epoch, dcde->dcde_write_cnt, - oci->oci_map_ver, &dcsh->dcsh_leader_oid, - NULL, 0, tgts, tgt_cnt - 1, dtx_flags, - dcsh->dcsh_mbs, &dlh); + rc = dtx_leader_begin(dca->dca_ioc->ioc_vos_coh, &dcsh->dcsh_xid, &dcsh->dcsh_epoch, + dcde->dcde_write_cnt, oci->oci_map_ver, &dcsh->dcsh_leader_oid, + NULL /* dti_cos */, 0 /* dti_cos_cnt */, tgts, tgt_cnt - 1, + dtx_flags, dcsh->dcsh_mbs, NULL /* dce */, &dlh); if (rc != 0) goto out; @@ -5163,7 +5207,7 @@ ds_obj_cpd_handler(crt_rpc_t *rpc) D_ASSERT(oci != NULL); - if (oci->oci_flags & ORF_CPD_LEADER) + if (oci->oci_flags & ORF_LEADER) leader = true; else leader = false; @@ -5352,3 +5396,504 @@ ds_obj_key2anchor_handler(crt_rpc_t *rpc) if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } + +struct obj_coll_tgt_args { + crt_rpc_t *octa_rpc; + struct daos_coll_shard *octa_shards; + uint32_t *octa_versions; + uint32_t octa_sponsor_tgt; + struct obj_io_context *octa_sponsor_ioc; + struct dtx_handle *octa_sponsor_dth; + union { + void *octa_misc; + /* Different collective operations may need different parameters. */ + struct dtx_memberships *octa_mbs; + }; +}; + +static int +obj_coll_tgt_punch(void *args) +{ + struct obj_coll_tgt_args *octa = args; + crt_rpc_t *rpc = octa->octa_rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct obj_punch_in *opi = NULL; + struct obj_tgt_punch_args otpa = { 0 }; + uint32_t tgt_id = dss_get_module_info()->dmi_tgt_id; + int rc; + + D_ALLOC_PTR(opi); + if (opi == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + opi->opi_dti = ocpi->ocpi_xid; + uuid_copy(opi->opi_pool_uuid, ocpi->ocpi_po_uuid); + uuid_copy(opi->opi_co_hdl, ocpi->ocpi_co_hdl); + uuid_copy(opi->opi_co_uuid, ocpi->ocpi_co_uuid); + opi->opi_oid = ocpi->ocpi_oid; + opi->opi_oid.id_shard = octa->octa_shards[tgt_id].dcs_buf[0]; + opi->opi_epoch = ocpi->ocpi_epoch; + opi->opi_api_flags = ocpi->ocpi_api_flags; + opi->opi_map_ver = ocpi->ocpi_map_ver; + opi->opi_flags = ocpi->ocpi_flags & ~ORF_LEADER; + + otpa.opi = opi; + otpa.opc = opc_get(rpc->cr_opc); + if (tgt_id == octa->octa_sponsor_tgt) { + otpa.sponsor_ioc = octa->octa_sponsor_ioc; + otpa.sponsor_dth = octa->octa_sponsor_dth; + } + otpa.mbs = octa->octa_mbs; + if (octa->octa_versions != NULL) + otpa.ver = &octa->octa_versions[tgt_id]; + otpa.data = rpc; + + rc = obj_tgt_punch(&otpa, octa->octa_shards[tgt_id].dcs_buf, + octa->octa_shards[tgt_id].dcs_nr); + D_FREE(opi); + +out: + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj shard "DF_UOID" with "DF_DTI" on tgt %u", + DP_OID(ocpi->ocpi_oid.id_pub), octa->octa_shards[tgt_id].dcs_buf[0], + ocpi->ocpi_oid.id_layout_ver, DP_DTI(&ocpi->ocpi_xid), tgt_id); + + return rc; +} + +typedef int (*obj_coll_func_t)(void *args); + +static int +obj_coll_local(crt_rpc_t *rpc, struct daos_coll_shard *shards, struct 
dtx_coll_entry *dce, + uint32_t *version, struct obj_io_context *ioc, struct dtx_handle *dth, void *args, + obj_coll_func_t func) +{ + struct obj_coll_tgt_args octa = { 0 }; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + uint32_t size = dce->dce_bitmap_sz << 3; + int rc = 0; + int i; + + D_ASSERT(dce->dce_bitmap != NULL); + D_ASSERT(ioc != NULL); + + if (version != NULL) { + if (size > dss_tgt_nr) + size = dss_tgt_nr; + D_ALLOC_ARRAY(octa.octa_versions, size); + if (octa.octa_versions == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + octa.octa_rpc = rpc; + octa.octa_shards = shards; + octa.octa_misc = args; + octa.octa_sponsor_ioc = ioc; + octa.octa_sponsor_dth = dth; + octa.octa_sponsor_tgt = dss_get_module_info()->dmi_tgt_id; + + coll_ops.co_func = func; + coll_args.ca_func_args = &octa; + coll_args.ca_tgt_bitmap = dce->dce_bitmap; + coll_args.ca_tgt_bitmap_sz = dce->dce_bitmap_sz; + + rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_USE_CURRENT_ULT); + +out: + if (octa.octa_versions != NULL) { + for (i = 0, *version = 0; i < size; i++) { + if (isset(dce->dce_bitmap, i) && *version < octa.octa_versions[i]) + *version = octa.octa_versions[i]; + } + D_FREE(octa.octa_versions); + } + + return rc; +} + +static int +obj_coll_punch_disp(struct dtx_leader_handle *dlh, void *arg, int idx, dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = arg; + crt_rpc_t *rpc = exec_arg->rpc; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + int rc; + + if (idx != -1) + return ds_obj_coll_punch_remote(dlh, arg, idx, comp_cb); + + /* Local punch on current rank, including the leader target. */ + rc = obj_coll_local(rpc, exec_arg->coll_shards, dlh->dlh_coll_entry, NULL, exec_arg->ioc, + &dlh->dlh_handle, dlh->dlh_handle.dth_mbs, obj_coll_tgt_punch); + + DL_CDEBUG(rc == 0 || rc == -DER_INPROGRESS || rc == -DER_TX_RESTART, DB_IO, DLOG_ERR, rc, + "Collective punch obj "DF_UOID" with "DF_DTI" on rank %u", + DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), dss_self_rank()); + + if (comp_cb != NULL) + comp_cb(dlh, idx, rc); + + return rc; +} + +static int +obj_coll_punch_bulk(crt_rpc_t *rpc, d_iov_t *iov, crt_proc_t *p_proc, + struct daos_coll_target **p_dcts, uint32_t *dct_nr) +{ + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct daos_coll_target *dcts = NULL; + crt_proc_t proc = NULL; + d_sg_list_t sgl; + d_sg_list_t *sgls = &sgl; + int rc = 0; + int i; + int j; + + D_ALLOC(iov->iov_buf, ocpi->ocpi_bulk_tgt_sz); + if (iov->iov_buf == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + iov->iov_buf_len = ocpi->ocpi_bulk_tgt_sz; + iov->iov_len = ocpi->ocpi_bulk_tgt_sz; + + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = iov; + + rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &ocpi->ocpi_tgt_bulk, NULL, NULL, + DAOS_HDL_INVAL, &sgls, 1, NULL, NULL); + if (rc != 0) + goto out; + + rc = crt_proc_create(dss_get_module_info()->dmi_ctx, iov->iov_buf, iov->iov_len, + CRT_PROC_DECODE, &proc); + if (rc != 0) + goto out; + + D_ALLOC_ARRAY(dcts, ocpi->ocpi_bulk_tgt_nr); + if (dcts == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < ocpi->ocpi_bulk_tgt_nr; i++) { + rc = crt_proc_struct_daos_coll_target(proc, CRT_PROC_DECODE, &dcts[i]); + if (rc != 0) { + crt_proc_reset(proc, iov->iov_buf, iov->iov_len, CRT_PROC_FREE); + for (j = 0; j < i; j++) + crt_proc_struct_daos_coll_target(proc, CRT_PROC_FREE, &dcts[j]); + goto out; + } + } + +out: + if (rc != 0) { + D_FREE(dcts); + if (proc != NULL) + crt_proc_destroy(proc); + daos_iov_free(iov); + } else { 
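+ /* Success: hand the decoded targets back to the caller together with + * the proc that still owns their buffers; both are released by the + * caller once the targets are no longer needed. + */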
+ *p_proc = proc; + *p_dcts = dcts; + *dct_nr = ocpi->ocpi_bulk_tgt_nr; + } + + return rc; +} + +static int +obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dcts, uint32_t dct_nr, + struct dtx_coll_entry **p_dce) +{ + struct pl_map *map = NULL; + struct dtx_memberships *mbs = ocpi->ocpi_mbs; + struct dtx_daos_target *ddt = mbs->dm_tgts; + struct dtx_coll_entry *dce = NULL; + struct dtx_coll_target *target; + d_rank_t max_rank = 0; + uint32_t size; + int rc = 0; + int i; + int j; + + /* dcts[0] is for the current engine. */ + if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL) + D_GOTO(out, rc = -DER_INVAL); + + /* Enough space was already allocated in the MBS during decode to hold the targets and bitmap. */ + target = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt); + + size = sizeof(*ddt) * mbs->dm_tgt_cnt + sizeof(*target) + + sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr + dcts[0].dct_bitmap_sz; + if (unlikely(ocpi->ocpi_odm.odm_mbs_max_sz < sizeof(*mbs) + size)) { + D_ERROR("Pre-allocated MBS buffer is too small: %u vs %zu + %u\n", + ocpi->ocpi_odm.odm_mbs_max_sz, sizeof(*mbs), size); + D_GOTO(out, rc = -DER_INVAL); + } + + target->dct_tgt_nr = dcts[0].dct_tgt_nr; + memcpy(target->dct_tgts, dcts[0].dct_tgt_ids, + sizeof(dcts[0].dct_tgt_ids[0]) * dcts[0].dct_tgt_nr); + target->dct_bitmap_sz = dcts[0].dct_bitmap_sz; + memcpy(target->dct_tgts + target->dct_tgt_nr, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz); + mbs->dm_data_size = size; + + D_ALLOC_PTR(dce); + if (dce == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_xid = ocpi->ocpi_xid; + dce->dce_ver = ocpi->ocpi_map_ver; + dce->dce_refs = 1; + + D_ALLOC(dce->dce_bitmap, dcts[0].dct_bitmap_sz); + if (dce->dce_bitmap == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_bitmap_sz = dcts[0].dct_bitmap_sz; + memcpy(dce->dce_bitmap, dcts[0].dct_bitmap, dcts[0].dct_bitmap_sz); + + if (!(ocpi->ocpi_flags & ORF_LEADER) || unlikely(dct_nr <= 1)) + D_GOTO(out, rc = 0); + + map = pl_map_find(ocpi->ocpi_po_uuid, ocpi->ocpi_oid.id_pub); + if (map == NULL) { + D_ERROR("Failed to find valid placement map in pool "DF_UUID"\n", + DP_UUID(ocpi->ocpi_po_uuid)); + D_GOTO(out, rc = -DER_INVAL); + } + + size = pool_map_node_nr(map->pl_poolmap); + D_ALLOC_ARRAY(dce->dce_hints, size); + if (dce->dce_hints == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dce->dce_ranks = d_rank_list_alloc(dct_nr - 1); + if (dce->dce_ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + /* Set i = 1 to skip leader_rank. 
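+ * dcts[0] is the current (leader) engine itself; the other entries feed + * dce_ranks, and the first set bit in each bitmap is recorded as the VOS + * target hint for that rank.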
*/ + for (i = 1; i < dct_nr; i++) { + dce->dce_ranks->rl_ranks[i - 1] = dcts[i].dct_rank; + if (max_rank < dcts[i].dct_rank) + max_rank = dcts[i].dct_rank; + + size = dcts[i].dct_bitmap_sz << 3; + if (size > dss_tgt_nr) + size = dss_tgt_nr; + + for (j = 0; j < size; j++) { + if (isset(dcts[i].dct_bitmap, j)) { + dce->dce_hints[dcts[i].dct_rank] = j; + break; + } + } + } + + dce->dce_hint_sz = max_rank + 1; + +out: + if (map != NULL) + pl_map_decref(map); + + if (rc != 0 && dce != NULL) + dtx_coll_entry_put(dce); + else + *p_dce = dce; + + return rc; +} + +void +ds_obj_coll_punch_handler(crt_rpc_t *rpc) +{ + struct dss_module_info *dmi = dss_get_module_info(); + struct dtx_leader_handle *dlh = NULL; + struct obj_coll_punch_in *ocpi = crt_req_get(rpc); + struct obj_dtx_mbs *odm = &ocpi->ocpi_odm; + struct ds_obj_exec_arg exec_arg = { 0 }; + struct obj_io_context ioc = { 0 }; + struct dtx_coll_entry *dce = NULL; + struct daos_coll_target *dcts = NULL; + d_iov_t iov = { 0 }; + crt_proc_t proc = NULL; + uint32_t dct_nr = 0; + uint32_t flags = 0; + uint32_t dtx_flags = DTX_TGT_COLL; + uint32_t version = 0; + uint32_t max_ver = 0; + struct dtx_epoch epoch; + daos_epoch_t tmp; + int rc; + int rc1; + int i; + bool need_abort = false; + + D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " + DF_DTI", forward width %u, forward depth %u\n", + (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : + (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), + rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + + D_ASSERT(dmi->dmi_xs_id != 0); + + rc = obj_ioc_begin(ocpi->ocpi_oid.id_pub, ocpi->ocpi_map_ver, ocpi->ocpi_po_uuid, + ocpi->ocpi_co_hdl, ocpi->ocpi_co_uuid, rpc, ocpi->ocpi_flags, &ioc); + if (rc != 0) + goto out; + + if (ocpi->ocpi_flags & ORF_LEADER && ocpi->ocpi_bulk_tgt_sz > 0) { + rc = obj_coll_punch_bulk(rpc, &iov, &proc, &dcts, &dct_nr); + if (rc != 0) + goto out; + } else { + dcts = ocpi->ocpi_tgts.ca_arrays; + dct_nr = ocpi->ocpi_tgts.ca_count; + } + + rc = obj_coll_punch_prep(ocpi, dcts, dct_nr, &dce); + if (rc != 0) + goto out; + + if (ocpi->ocpi_flags & ORF_LEADER) { + rc = process_epoch(&ocpi->ocpi_epoch, NULL /* epoch_first */, &ocpi->ocpi_flags); + if (rc == PE_OK_LOCAL) + ocpi->ocpi_flags &= ~ORF_EPOCH_UNCERTAIN; + } else if (dct_nr == 1) { + rc = obj_coll_local(rpc, dcts[0].dct_shards, dce, &version, &ioc, NULL, + odm->odm_mbs, obj_coll_tgt_punch); + goto out; + } + + version = ocpi->ocpi_map_ver; + max_ver = ocpi->ocpi_map_ver; + + if (ocpi->ocpi_flags & ORF_DTX_SYNC) + dtx_flags |= DTX_SYNC; + + if (!(ocpi->ocpi_flags & ORF_LEADER)) + dtx_flags |= DTX_RELAY; + + if (ocpi->ocpi_flags & ORF_RESEND) { + +again1: + tmp = 0; + rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); + switch (rc) { + case -DER_ALREADY: + D_GOTO(out, rc = 0); + case 0: + ocpi->ocpi_epoch = tmp; + flags |= ORF_RESEND; + /* TODO: Also recover the epoch uncertainty. 
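+ * (rc == 0 from dtx_handle_resend() means the DTX was prepared before, + * so its original epoch is reused for the resent punch.)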
*/ + break; + case -DER_NONEXIST: + rc = 0; + break; + default: + D_GOTO(out, rc); + } + + dce->dce_ver = version; + } + +again2: + epoch.oe_value = ocpi->ocpi_epoch; + epoch.oe_first = epoch.oe_value; + epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); + + if (flags & ORF_RESEND) + dtx_flags |= DTX_PREPARED; + else + dtx_flags &= ~DTX_PREPARED; + + exec_arg.rpc = rpc; + exec_arg.ioc = &ioc; + exec_arg.flags = flags; + exec_arg.coll_shards = dcts[0].dct_shards; + exec_arg.coll_tgts = dcts; + obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, + sizeof(*ocpi) + sizeof(*odm->odm_mbs) + odm->odm_mbs->dm_data_size, + 1 /* start, [0] is for current engine */, ocpi->ocpi_disp_width, + &exec_arg.coll_cur); + + rc = dtx_leader_begin(ioc.ioc_vos_coh, &odm->odm_xid, &epoch, 1, version, + &ocpi->ocpi_oid, NULL /* dti_cos */, 0 /* dti_cos_cnt */, + NULL /* tgts */, exec_arg.coll_cur.grp_nr /* tgt_cnt */, + dtx_flags, odm->odm_mbs, dce, &dlh); + if (rc != 0) { + D_ERROR(DF_UOID ": Failed to start DTX for collective punch: "DF_RC"\n", + DP_UOID(ocpi->ocpi_oid), DP_RC(rc)); + D_GOTO(out, rc); + } + + /* Execute the operation on all shards */ + rc = dtx_leader_exec_ops(dlh, obj_coll_punch_disp, NULL, 0, &exec_arg); + + if (max_ver < dlh->dlh_rmt_ver) + max_ver = dlh->dlh_rmt_ver; + + rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); + + if (dtx_flags & DTX_RELAY) + goto out; + + switch (rc) { + case -DER_TX_RESTART: + ocpi->ocpi_epoch = d_hlc_get(); + ocpi->ocpi_flags &= ~ORF_RESEND; + flags = 0; + goto again2; + case -DER_AGAIN: + ocpi->ocpi_flags |= ORF_RESEND; + need_abort = true; + ABT_thread_yield(); + goto again1; + default: + break; + } + +out: + if (rc != 0 && need_abort) { + rc1 = dtx_coll_abort(ioc.ioc_coc, dce, ocpi->ocpi_epoch); + if (rc1 != 0 && rc1 != -DER_NONEXIST) + D_WARN("Failed to collective abort DTX "DF_DTI": "DF_RC"\n", + DP_DTI(&ocpi->ocpi_xid), DP_RC(rc1)); + } + + if (max_ver < ioc.ioc_map_ver) + max_ver = ioc.ioc_map_ver; + + if (max_ver < version) + max_ver = version; + + DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, + "(%s) handled collective punch RPC %p for obj " + DF_UOID" on XS %u/%u epc "DF_X64" pmv %u/%u, with dti " + DF_DTI", forward width %u, forward depth %u", + (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : + (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_disp_width, + ocpi->ocpi_disp_depth); + + obj_punch_complete(rpc, rc, max_ver); + + dtx_coll_entry_put(dce); + if (proc != NULL) { + D_ASSERT(dcts != NULL); + + crt_proc_reset(proc, iov.iov_buf, iov.iov_len, CRT_PROC_FREE); + for (i = 0; i < dct_nr; i++) + crt_proc_struct_daos_coll_target(proc, CRT_PROC_FREE, &dcts[i]); + crt_proc_destroy(proc); + D_FREE(dcts); + daos_iov_free(&iov); + } + + /* It is no matter even if obj_ioc_begin() was not called. */ + obj_ioc_end(&ioc, rc); +} diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 121f843df97..70ce7973553 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -996,7 +996,12 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, offset = iods[i].iod_recxs[0].rx_idx; size = iods[i].iod_recxs[0].rx_nr; - parity_eph = ephs[i][0]; + /* Use stable epoch for partial parity update to make sure + * these partial updates are not below stable epoch boundary, + * otherwise both EC and VOS aggregation might operate on + * the same recxs. + */ + parity_eph = encode ? ephs[i][0] : mrone->mo_epoch; tmp_iod = iods[i]; ptr = iov[i].iov_buf; for (j = 1; j < iods[i].iod_nr; j++) { @@ -1059,10 +1064,10 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, /* If the epoch is higher than EC aggregate boundary, then * it should use stable epoch to fetch the data, since * the data could be aggregated independently on parity - * and data shard, so using stable epoch could make sure - * the consistency view during rebuild. And also EC aggregation - * should already aggregate the parity, so there should not - * be any partial update on the parity as well. + * and data shard, so it should select the minimum epoch + * between stable epoch and boundary epoch as the recovery + * epoch to make sure parity data rebuilt will not be interfered + * by the newer update. * * Otherwise there might be partial update on this rebuilding * shard, so let's use the epoch from the parity shard to fetch @@ -1073,7 +1078,7 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, */ if (ds_cont->sc_ec_agg_eph_boundary > mrone->mo_iods_update_ephs_from_parity[i][j]) - fetch_eph = mrone->mo_epoch; + fetch_eph = min(ds_cont->sc_ec_agg_eph_boundary, mrone->mo_epoch); else fetch_eph = mrone->mo_iods_update_ephs_from_parity[i][j]; diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index f78cc03b07c..66d36ec0885 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2023 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,18 +35,23 @@ struct obj_remote_cb_arg { }; static void -do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_update_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_rw_out *orwo = crt_reply_get(req); struct obj_rw_in *orw_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; if (orw_parent->orw_map_ver < orwo->orw_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(orw_parent->orw_oid), orw_parent->orw_map_ver, orwo->orw_map_version); + sub->dss_version = orwo->orw_map_version; rc1 = -DER_STALE; } else { rc1 = orwo->orw_ret; @@ -60,12 +65,6 @@ do_shard_update_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_update_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_update_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - static void obj_inherit_timeout(crt_rpc_t *parent, crt_rpc_t *child) { @@ -135,14 +134,13 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, orw_parent = crt_req_get(parent_req); orw = crt_req_get(req); *orw = *orw_parent; + orw->orw_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(orw->orw_co_hdl, orw_parent->orw_co_hdl); - uuid_copy(orw->orw_co_uuid, orw_parent->orw_co_uuid); orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; - orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; - orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; + orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; + orw->orw_dti_cos.ca_arrays = dth->dth_dti_cos; D_DEBUG(DB_TRACE, DF_UOID" forwarding to rank:%d tag:%d.\n", DP_UOID(orw->orw_oid), tgt_ep.ep_rank, tgt_ep.ep_tag); @@ -165,18 +163,23 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_punch_req_cb(const struct crt_cb_info *cb_info) { + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; crt_rpc_t *parent_req = arg->parent_req; struct obj_punch_out *opo = crt_reply_get(req); - struct obj_punch_in *opi_parent = crt_req_get(req); + struct obj_punch_in *opi_parent = crt_req_get(parent_req); struct dtx_leader_handle *dlh = arg->dlh; - int rc1 = 0; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; if (opi_parent->opi_map_ver < opo->opo_map_version) { D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", DP_UOID(opi_parent->opi_oid), opi_parent->opi_map_ver, opo->opo_map_version); + sub->dss_version = opo->opo_map_version; rc1 = -DER_STALE; } else { rc1 = opo->opo_ret; @@ -190,12 +193,6 @@ do_shard_punch_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_punch_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_punch_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Execute punch on the remote target */ int ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, @@ -213,6 +210,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, struct obj_punch_in 
*opi_parent; crt_opcode_t opc; int rc = 0; + bool sent_rpc = false; D_ASSERT(idx < dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt); sub = &dlh->dlh_subs[idx]; @@ -248,11 +246,8 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, opi_parent = crt_req_get(parent_req); opi = crt_req_get(req); *opi = *opi_parent; + opi->opi_oid.id_shard = shard_tgt->st_shard_id; - uuid_copy(opi->opi_co_hdl, opi_parent->opi_co_hdl); - uuid_copy(opi->opi_co_uuid, opi_parent->opi_co_uuid); - opi->opi_shard_tgts.ca_count = opi_parent->opi_shard_tgts.ca_count; - opi->opi_shard_tgts.ca_arrays = opi_parent->opi_shard_tgts.ca_arrays; opi->opi_flags |= obj_exec_arg->flags; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; @@ -268,10 +263,11 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, D_ASSERT(sub->dss_comp == 1); D_ERROR("crt_req_send failed, rc "DF_RC"\n", DP_RC(rc)); } - return rc; + + sent_rpc = true; out: - if (rc) { + if (!sent_rpc) { sub->dss_result = rc; comp_cb(dlh, idx, rc); if (remote_arg) { @@ -283,9 +279,12 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, } static void -do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) +shard_cpd_req_cb(const struct crt_cb_info *cb_info) { - struct obj_cpd_out *oco = crt_reply_get(req); + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + struct obj_cpd_out *oco = crt_reply_get(req); + int rc = cb_info->cci_rc; if (rc >= 0) rc = oco->oco_ret; @@ -298,12 +297,6 @@ do_shard_cpd_req_cb(crt_rpc_t *req, struct obj_remote_cb_arg *arg, int rc) D_FREE(arg); } -static inline void -shard_cpd_req_cb(const struct crt_cb_info *cb_info) -{ - do_shard_cpd_req_cb(cb_info->cci_rpc, cb_info->cci_arg, cb_info->cci_rc); -} - /* Dispatch CPD RPC and handle sub requests remotely */ int ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, @@ -375,7 +368,7 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, uuid_copy(oci->oci_co_hdl, oci_parent->oci_co_hdl); uuid_copy(oci->oci_co_uuid, oci_parent->oci_co_uuid); oci->oci_map_ver = oci_parent->oci_map_ver; - oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_CPD_LEADER; + oci->oci_flags = (oci_parent->oci_flags | exec_arg->flags) & ~ORF_LEADER; oci->oci_disp_tgts.ca_arrays = NULL; oci->oci_disp_tgts.ca_count = 0; @@ -461,3 +454,127 @@ ds_obj_cpd_dispatch(struct dtx_leader_handle *dlh, void *arg, int idx, return rc; } + +static void +shard_coll_punch_req_cb(const struct crt_cb_info *cb_info) +{ + struct obj_remote_cb_arg *arg = cb_info->cci_arg; + crt_rpc_t *req = cb_info->cci_rpc; + crt_rpc_t *parent_req = arg->parent_req; + struct obj_coll_punch_out *ocpo = crt_reply_get(req); + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); + struct dtx_leader_handle *dlh = arg->dlh; + struct dtx_sub_status *sub = &dlh->dlh_subs[arg->idx]; + int rc = cb_info->cci_rc; + int rc1; + + if (ocpi_parent->ocpi_map_ver < ocpo->ocpo_map_version) { + D_DEBUG(DB_IO, DF_UOID": map_ver stale (%d < %d).\n", + DP_UOID(ocpi_parent->ocpi_oid), ocpi_parent->ocpi_map_ver, + ocpo->ocpo_map_version); + sub->dss_version = ocpo->ocpo_map_version; + rc1 = -DER_STALE; + } else { + rc1 = ocpo->ocpo_ret; + } + + if (rc >= 0) + rc = rc1; + + arg->comp_cb(dlh, arg->idx, rc); + crt_req_decref(parent_req); + D_FREE(arg); +} + +int +ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, + 
dtx_sub_comp_cb_t comp_cb) +{ + struct ds_obj_exec_arg *exec_arg = data; + struct obj_coll_disp_cursor *cursor = &exec_arg->coll_cur; + struct obj_remote_cb_arg *remote_arg; + struct dtx_sub_status *sub; + crt_endpoint_t tgt_ep = { 0 }; + crt_rpc_t *parent_req = exec_arg->rpc; + crt_rpc_t *req; + struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi; + int tag; + int rc = 0; + bool sent_rpc = false; + + D_ASSERT(idx < dlh->dlh_normal_sub_cnt); + + sub = &dlh->dlh_subs[idx]; + + D_ALLOC_PTR(remote_arg); + if (remote_arg == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + tag = tgt_ep.ep_tag; + + crt_req_addref(parent_req); + remote_arg->parent_req = parent_req; + remote_arg->dlh = dlh; + remote_arg->comp_cb = comp_cb; + remote_arg->idx = idx; + + rc = obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_COLL_PUNCH, &req); + if (rc != 0) { + D_ERROR("crt_req_create failed for coll punch: "DF_RC"\n", DP_RC(rc)); + D_GOTO(out, rc); + } + + ocpi_parent = crt_req_get(parent_req); + ocpi = crt_req_get(req); + + ocpi->ocpi_odm = ocpi_parent->ocpi_odm; + uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); + uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); + uuid_copy(ocpi->ocpi_co_uuid, ocpi_parent->ocpi_co_uuid); + ocpi->ocpi_oid = ocpi_parent->ocpi_oid; + ocpi->ocpi_oid.id_shard = exec_arg->coll_tgts[cursor->cur_pos].dct_shards[tag].dcs_buf[0]; + ocpi->ocpi_epoch = ocpi_parent->ocpi_epoch; + ocpi->ocpi_api_flags = ocpi_parent->ocpi_api_flags; + ocpi->ocpi_map_ver = ocpi_parent->ocpi_map_ver; + ocpi->ocpi_flags = (exec_arg->flags | ocpi_parent->ocpi_flags) & ~ORF_LEADER; + ocpi->ocpi_bulk_tgt_sz = 0; + ocpi->ocpi_bulk_tgt_nr = 0; + ocpi->ocpi_tgt_bulk = NULL; + ocpi->ocpi_max_tgt_sz = ocpi_parent->ocpi_max_tgt_sz; + if (cursor->grp_nr < COLL_DISP_WIDTH_MIN) { + ocpi->ocpi_disp_width = cursor->grp_nr; + } else { + ocpi->ocpi_disp_width = cursor->grp_nr - COLL_DISP_WIDTH_DIF; + if (ocpi->ocpi_disp_width < COLL_DISP_WIDTH_MIN) + ocpi->ocpi_disp_width = COLL_DISP_WIDTH_MIN; + } + ocpi->ocpi_disp_depth = ocpi_parent->ocpi_disp_depth + 1; + ocpi->ocpi_tgts.ca_count = cursor->cur_step; + ocpi->ocpi_tgts.ca_arrays = &exec_arg->coll_tgts[cursor->cur_pos]; + + D_DEBUG(DB_IO, DF_UOID" broadcast collective punch RPC with flags %x/"DF_X64"\n", + DP_UOID(ocpi->ocpi_oid), ocpi->ocpi_flags, ocpi->ocpi_api_flags); + + obj_coll_disp_move(cursor); + + rc = crt_req_send(req, shard_coll_punch_req_cb, remote_arg); + if (rc != 0) { + D_ASSERT(sub->dss_comp == 1); + D_ERROR("crt_req_send failed for collective punch remote: "DF_RC"\n", DP_RC(rc)); + } + + sent_rpc = true; + +out: + if (!sent_rpc) { + sub->dss_result = rc; + comp_cb(dlh, idx, rc); + if (remote_arg != NULL) { + crt_req_decref(parent_req); + D_FREE(remote_arg); + } + } + return rc; +} diff --git a/src/pipeline/srv_pipeline.c b/src/pipeline/srv_pipeline.c index d9659021820..eec8022f2ad 100644 --- a/src/pipeline/srv_pipeline.c +++ b/src/pipeline/srv_pipeline.c @@ -13,7 +13,7 @@ #include #include #include -#include "daos_api.h" +#include #include "pipeline_rpc.h" #include "pipeline_internal.h" diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 86052df05a5..3ef280ab8fc 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -706,6 +706,8 @@ get_object_layout(struct pl_jump_map *jmap, uint32_t layout_ver, struct pl_obj_l layout->ol_shards[k].po_target = target->ta_comp.co_id; layout->ol_shards[k].po_fseq = 
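
Returning to ds_obj_coll_punch_remote() above: the dispatch-width computation is worth a gloss. A sketch of the same clamping logic, with made-up constants (COLL_DISP_WIDTH_MIN and COLL_DISP_WIDTH_DIF are the real, unshown values):

/* EX_WIDTH_MIN/EX_WIDTH_DIF are invented here purely for illustration. */
#define EX_WIDTH_MIN 8
#define EX_WIDTH_DIF 4

static int
ex_dispatch_width(int grp_nr)
{
        int width;

        if (grp_nr < EX_WIDTH_MIN)
                return grp_nr;             /* small trees fan out fully */
        width = grp_nr - EX_WIDTH_DIF;     /* hold a few groups back */
        if (width < EX_WIDTH_MIN)
                width = EX_WIDTH_MIN;      /* but never below the floor */
        return width;
}
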
target->ta_comp.co_fseq; layout->ol_shards[k].po_shard = k; + layout->ol_shards[k].po_rank = target->ta_comp.co_rank; + layout->ol_shards[k].po_index = target->ta_comp.co_index; /** If target is failed queue it for remap*/ if (need_remap_comp(&target->ta_comp, allow_status)) { diff --git a/src/placement/pl_map.h b/src/placement/pl_map.h index 5803c87d58e..7c1335e64bd 100644 --- a/src/placement/pl_map.h +++ b/src/placement/pl_map.h @@ -85,6 +85,8 @@ struct failed_shard { uint32_t fs_shard_idx; uint32_t fs_fseq; uint32_t fs_tgt_id; + uint16_t fs_rank; + uint8_t fs_index; uint8_t fs_status; }; diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index 37743b74fc3..9cbfadd6d18 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ -74,6 +74,8 @@ remap_alloc_one(d_list_t *remap_list, unsigned int shard_idx, D_INIT_LIST_HEAD(&f_new->fs_list); f_new->fs_shard_idx = shard_idx; f_new->fs_fseq = tgt->ta_comp.co_fseq; + f_new->fs_rank = tgt->ta_comp.co_rank; + f_new->fs_index = tgt->ta_comp.co_index; f_new->fs_status = tgt->ta_comp.co_status; f_new->fs_data = data; @@ -321,6 +323,8 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, /* The selected spare target is up and ready */ l_shard->po_target = spare_tgt->ta_comp.co_id; l_shard->po_fseq = f_shard->fs_fseq; + l_shard->po_rank = spare_tgt->ta_comp.co_rank; + l_shard->po_index = spare_tgt->ta_comp.co_index; /* * Mark the shard as 'rebuilding' so that read will @@ -421,6 +425,8 @@ pl_map_extend(struct pl_obj_layout *layout, d_list_t *extended_list) new_shards[grp_idx].po_fseq = f_shard->fs_fseq; new_shards[grp_idx].po_shard = f_shard->fs_shard_idx; new_shards[grp_idx].po_target = f_shard->fs_tgt_id; + new_shards[grp_idx].po_rank = f_shard->fs_rank; + new_shards[grp_idx].po_index = f_shard->fs_index; if (f_shard->fs_status != PO_COMP_ST_DRAIN) new_shards[grp_idx].po_rebuilding = 1; diff --git a/src/placement/ring_map.c b/src/placement/ring_map.c index 48b1247b357..d123ef982b9 100644 --- a/src/placement/ring_map.c +++ b/src/placement/ring_map.c @@ -1076,9 +1076,11 @@ ring_obj_layout_fill(struct pl_map *map, struct daos_obj_md *md, pos = plts[idx].pt_pos; tgt = &tgts[pos]; - layout->ol_shards[k].po_shard = rop->rop_shard_id + k; + layout->ol_shards[k].po_shard = rop->rop_shard_id + k; layout->ol_shards[k].po_target = tgt->ta_comp.co_id; - layout->ol_shards[k].po_fseq = tgt->ta_comp.co_fseq; + layout->ol_shards[k].po_fseq = tgt->ta_comp.co_fseq; + layout->ol_shards[k].po_rank = tgt->ta_comp.co_rank; + layout->ol_shards[k].po_index = tgt->ta_comp.co_index; if (pool_target_unavail(tgt, for_reint)) { rc = remap_alloc_one(remap_list, k, tgt, for_reint, NULL); diff --git a/src/pool/cli.c b/src/pool/cli.c index 5b5283f2573..22516be7a27 100644 --- a/src/pool/cli.c +++ b/src/pool/cli.c @@ -1719,6 +1719,9 @@ map_refresh_cb(tse_task_t *task, void *varg) bool reinit = false; int rc = task->dt_result; + /* Get an extra reference for the reinit case. */ + dc_pool_get(pool); + /* * If it turns out below that we do need to update the cached pool map, * then holding the lock while doing so will be okay, since we probably @@ -1843,6 +1846,7 @@ map_refresh_cb(tse_task_t *task, void *varg) dc_pool_put(arg->mra_pool); } + dc_pool_put(pool); return rc; } @@ -1857,6 +1861,9 @@ map_refresh(tse_task_t *task) struct map_refresh_cb_arg cb_arg; int rc; + /* Get an extra reference for the reinit cases. 
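
The two "extra reference" comments in this pool/cli.c hunk are instances of the same idiom: pin the object for the duration of an asynchronous step that may re-initialize the task and drop the task's own reference. A toy sketch, with illustrative names only (dc_pool_get()/dc_pool_put() are the real calls):

#include <assert.h>

struct ex_pool { int refs; };

static void ex_get(struct ex_pool *p) { p->refs++; }
static void ex_put(struct ex_pool *p) { assert(p->refs > 0); p->refs--; }

static int
ex_refresh_step(struct ex_pool *pool)
{
        int rc = 0;

        ex_get(pool);   /* pin: a re-initialized task may drop its own ref */
        /* ... work that may complete or re-initialize the task ... */
        ex_put(pool);   /* every exit path pairs with exactly one put */
        return rc;
}
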
*/
+        dc_pool_get(pool);
+
         if (arg->mra_passive) {
                 /*
                  * Passive pool map refresh tasks do nothing besides waiting
@@ -1921,7 +1928,7 @@ map_refresh(tse_task_t *task)
                               DP_UUID(pool->dp_pool), task, DP_RC(rc));
                         goto out_task;
                 }
-                goto out;
+                goto out_pool;
         }
 
         if (pool->dp_map_task == NULL) {
@@ -1969,7 +1976,7 @@ map_refresh(tse_task_t *task)
                               DP_UUID(pool->dp_pool), query_task, DP_RC(rc));
                         goto out_map_task;
                 }
-                goto out;
+                goto out_pool;
         }
 
         /*
@@ -2001,6 +2008,7 @@ map_refresh(tse_task_t *task)
         D_DEBUG(DB_MD, DF_UUID": %p: asking rank %u for version > %u\n",
                 DP_UUID(pool->dp_pool), task, rank, version);
 
+        dc_pool_put(pool);
         return daos_rpc_send(rpc, task);
 
 out_cb_arg:
@@ -2014,7 +2022,8 @@ map_refresh(tse_task_t *task)
         d_backoff_seq_fini(&arg->mra_backoff_seq);
         dc_pool_put(arg->mra_pool);
         tse_task_complete(task, rc);
-out:
+out_pool:
+        dc_pool_put(pool);
         return rc;
 }
 
diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h
index e0370d1ee68..0d9854fd436 100644
--- a/src/pool/srv_internal.h
+++ b/src/pool/srv_internal.h
@@ -229,7 +229,7 @@ void ds_pool_tgt_discard_handler(crt_rpc_t *rpc);
  */
 bool ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank);
 int ds_pool_plan_svc_reconfs(int svc_rf, struct pool_map *map, d_rank_list_t *replicas,
-                             d_rank_t self, d_rank_list_t **to_add_out,
+                             d_rank_t self, bool filter_only, d_rank_list_t **to_add_out,
                              d_rank_list_t **to_remove_out);
 int ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version,
                              crt_rpc_t *rpc, crt_bulk_t remote_bulk,
diff --git a/src/pool/srv_iv.c b/src/pool/srv_iv.c
index 55a0141d7cc..92970ff3d5f 100644
--- a/src/pool/srv_iv.c
+++ b/src/pool/srv_iv.c
@@ -794,6 +794,31 @@ pool_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key,
         return rc;
 }
 
+int
+ds_pool_iv_refresh_hdl(struct ds_pool *pool, struct pool_iv_hdl *pih)
+{
+        int rc;
+
+        if (!uuid_is_null(pool->sp_srv_cont_hdl)) {
+                if (uuid_compare(pool->sp_srv_cont_hdl,
+                                 pih->pih_cont_hdl) == 0)
+                        return 0;
+                ds_cont_tgt_close(pool->sp_srv_cont_hdl);
+                D_DEBUG(DB_MD, "delete hdl "DF_UUID"\n", DP_UUID(pool->sp_srv_cont_hdl));
+                uuid_clear(pool->sp_srv_cont_hdl);
+                uuid_clear(pool->sp_srv_pool_hdl);
+        }
+
+        rc = ds_cont_tgt_open(pool->sp_uuid, pih->pih_cont_hdl, NULL, 0,
+                              ds_sec_get_rebuild_cont_capabilities(), 0);
+        if (rc == 0) {
+                uuid_copy(pool->sp_srv_cont_hdl, pih->pih_cont_hdl);
+                uuid_copy(pool->sp_srv_pool_hdl, pih->pih_pool_hdl);
+        }
+
+        return rc;
+}
+
 static int
 pool_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
                    d_sg_list_t *src, void **priv)
@@ -877,6 +902,8 @@ pool_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
                                 break;
                         conn = pool_iv_conn_next(conn);
                 }
+        } else if (entry->iv_class->iv_class_id == IV_POOL_HDL) {
+                rc = ds_pool_iv_refresh_hdl(pool, &src_iv->piv_hdl);
         }
 
         /* Since pool_tgt_connect/prop_update/refresh_hdl might yield due to
@@ -897,30 +924,6 @@ pool_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
         return rc;
 }
 
-int
-ds_pool_iv_refresh_hdl(struct ds_pool *pool, struct pool_iv_hdl *pih)
-{
-        int rc;
-
-        if (!uuid_is_null(pool->sp_srv_cont_hdl)) {
-                if (uuid_compare(pool->sp_srv_cont_hdl,
-                                 pih->pih_cont_hdl) == 0)
-                        return 0;
-                ds_cont_tgt_close(pool->sp_srv_cont_hdl);
-                uuid_clear(pool->sp_srv_cont_hdl);
-                uuid_clear(pool->sp_srv_pool_hdl);
-        }
-
-        rc = ds_cont_tgt_open(pool->sp_uuid, pih->pih_cont_hdl, NULL, 0,
-                              ds_sec_get_rebuild_cont_capabilities(), 0);
-        if (rc == 0) {
-                uuid_copy(pool->sp_srv_cont_hdl, pih->pih_cont_hdl);
-                uuid_copy(pool->sp_srv_pool_hdl, pih->pih_pool_hdl);
-        }
-
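
ds_pool_iv_refresh_hdl() (moved above pool_iv_ent_update() so it is defined before its new caller) is an instance of a cache-refresh idiom: keep a cached handle, close it only when the incoming identity differs, then reopen and record the new identity on success. A toy rendering, with all names illustrative:

#include <stdbool.h>
#include <string.h>

struct ex_cache { unsigned char id[16]; bool open; };

static int  ex_open(const unsigned char id[16])  { (void)id; return 0; }
static void ex_close(const unsigned char id[16]) { (void)id; }

static int
ex_refresh_hdl(struct ex_cache *c, const unsigned char new_id[16])
{
        int rc;

        if (c->open) {
                if (memcmp(c->id, new_id, sizeof(c->id)) == 0)
                        return 0;       /* identity unchanged: keep handle */
                ex_close(c->id);        /* drop the stale handle first */
                c->open = false;
        }
        rc = ex_open(new_id);
        if (rc == 0) {
                memcpy(c->id, new_id, sizeof(c->id));
                c->open = true;
        }
        return rc;
}
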
- return rc; -} - static int pool_iv_ent_invalid(struct ds_iv_entry *entry, struct ds_iv_key *key) { diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 119cfcea584..6396e6bea59 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -806,53 +806,31 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co * calling d_rank_list_free(*ranksp). */ static int -select_svc_ranks(int svc_rf, const d_rank_list_t *target_addrs, int ndomains, - const uint32_t *domains, d_rank_list_t **ranksp) +select_svc_ranks(int svc_rf, struct pool_buf *map_buf, uint32_t map_version, + d_rank_list_t **ranksp) { - int nreplicas = ds_pool_svc_rf_to_nreplicas(svc_rf); - int selectable; - d_rank_list_t *rnd_tgts; - d_rank_list_t *ranks; - int i; - int j; - int rc; + struct pool_map *map; + d_rank_list_t replicas = {0}; + d_rank_list_t *to_add; + d_rank_list_t *to_remove; + int rc; - rc = d_rank_list_dup(&rnd_tgts, target_addrs); + rc = pool_map_create(map_buf, map_version, &map); if (rc != 0) return rc; - /* Shuffle the target ranks to avoid overloading any particular ranks. */ - /* - * DAOS-9177: Temporarily disable shuffle to give us more time to stabilize tests. - */ - /*daos_rank_list_shuffle(rnd_tgts);*/ - - /* Determine the number of selectable targets. */ - selectable = rnd_tgts->rl_nr; - - if (nreplicas > selectable) - nreplicas = selectable; - ranks = daos_rank_list_alloc(nreplicas); - if (ranks == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - /* TODO: Choose ranks according to failure domains. */ - j = 0; - for (i = 0; i < rnd_tgts->rl_nr; i++) { - if (j == ranks->rl_nr) - break; - D_DEBUG(DB_MD, "ranks[%d]: %u\n", j, rnd_tgts->rl_ranks[i]); - ranks->rl_ranks[j] = rnd_tgts->rl_ranks[i]; - j++; - } - D_ASSERTF(j == ranks->rl_nr, "%d == %u\n", j, ranks->rl_nr); + rc = ds_pool_plan_svc_reconfs(svc_rf, map, &replicas, CRT_NO_RANK /* self */, + false /* filter_only */, &to_add, &to_remove); + pool_map_decref(map); + if (rc != 0) + return rc; + D_ASSERTF(to_remove->rl_nr == 0, "to_remove=%u\n", to_remove->rl_nr); + d_rank_list_free(to_remove); - *ranksp = ranks; - rc = 0; + d_rank_list_sort(to_add); -out: - d_rank_list_free(rnd_tgts); - return rc; + *ranksp = to_add; + return 0; } /* TODO: replace all rsvc_complete_rpc() calls in this file with pool_rsvc_complete_rpc() */ @@ -904,6 +882,8 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, daos_prop_t *prop, d_rank_list_t **svc_addrs) { struct daos_prop_entry *svc_rf_entry; + struct pool_buf *map_buf; + uint32_t map_version = 1; d_rank_list_t *ranks; d_iov_t psid; struct rsvc_client client; @@ -935,6 +915,11 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, D_ASSERTF(ntargets == target_addrs->rl_nr, "ntargets=%d num=%u\n", ntargets, target_addrs->rl_nr); + rc = gen_pool_buf(NULL /* map */, &map_buf, map_version, ndomains, target_addrs->rl_nr, + target_addrs->rl_nr * dss_tgt_nr, domains, dss_tgt_nr); + if (rc != 0) + goto out; + svc_rf_entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_REDUN_FAC); D_ASSERT(svc_rf_entry != NULL && !(svc_rf_entry->dpe_flags & DAOS_PROP_ENTRY_NOT_SET)); D_ASSERTF(daos_svc_rf_is_valid(svc_rf_entry->dpe_val), DF_U64"\n", svc_rf_entry->dpe_val); @@ -942,9 +927,9 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, D_DEBUG(DB_MD, DF_UUID": creating PS: ntargets=%d ndomains=%d svc_rf="DF_U64"\n", DP_UUID(pool_uuid), ntargets, ndomains, svc_rf_entry->dpe_val); - rc = select_svc_ranks(svc_rf_entry->dpe_val, 
target_addrs, ndomains, domains, &ranks); + rc = select_svc_ranks(svc_rf_entry->dpe_val, map_buf, map_version, &ranks); if (rc != 0) - D_GOTO(out, rc); + goto out_map_buf; d_iov_set(&psid, (void *)pool_uuid, sizeof(uuid_t)); rc = ds_rsvc_dist_start(DS_RSVC_CLASS_POOL, &psid, pool_uuid, ranks, RDB_NIL_TERM, @@ -984,6 +969,7 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, DL_ERROR(rc, DF_UUID ": failed to create POOL_CREATE RPC", DP_UUID(pool_uuid)); goto out_backoff_seq; } + /* We could send map_buf to simplify things. */ pool_create_in_set_data(rpc, target_addrs, prop, ndomains, ntargets, domains); /* Send the POOL_CREATE request. */ @@ -1020,6 +1006,8 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, */ out_ranks: d_rank_list_free(ranks); +out_map_buf: + D_FREE(map_buf); out: return rc; } @@ -6231,8 +6219,8 @@ pool_svc_reconf_ult(void *varg) if (arg->sca_map == NULL) ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - rc = ds_pool_plan_svc_reconfs(svc->ps_svc_rf, map, current, dss_self_rank(), &to_add, - &to_remove); + rc = ds_pool_plan_svc_reconfs(svc->ps_svc_rf, map, current, dss_self_rank(), + arg->sca_sync_remove /* filter_only */, &to_add, &to_remove); if (arg->sca_map == NULL) ABT_rwlock_unlock(svc->ps_pool->sp_lock); if (rc != 0) { @@ -6289,9 +6277,6 @@ pool_svc_reconf_ult(void *varg) } if (rdb_get_ranks(svc->ps_rsvc.s_db, &new) == 0) { - d_rank_list_sort(current); - d_rank_list_sort(new); - if (svc->ps_force_notify || !d_rank_list_identical(new, current)) { int rc_tmp; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index a04021d9fb8..8dce5ae8661 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -124,32 +124,21 @@ gc_rate_ctl(void *arg) struct ds_pool_child *child = (struct ds_pool_child *)arg; struct ds_pool *pool = child->spc_pool; struct sched_request *req = child->spc_gc_req; + uint32_t msecs; if (dss_ult_exiting(req)) return -1; - /* Let GC ULT run in tight mode when system is idle */ - if (!dss_xstream_is_busy()) { + /* Let GC ULT run in tight mode when system is idle or under space pressure */ + if (!dss_xstream_is_busy() || sched_req_space_check(req) != SCHED_SPACE_PRESS_NONE) { sched_req_yield(req); return 0; } - /* - * When it's under space pressure, GC will continue run in slack mode - * no matter what reclaim policy is used, otherwise, it'll take an extra - * sleep to minimize the performance impact. - */ - if (sched_req_space_check(req) == SCHED_SPACE_PRESS_NONE) { - uint32_t msecs; - - msecs = (pool->sp_reclaim == DAOS_RECLAIM_LAZY || - pool->sp_reclaim == DAOS_RECLAIM_DISABLED) ? 2000 : 50; - sched_req_sleep(req, msecs); - } else { - sched_req_yield(req); - } - - /* Let GC ULT run in slack mode when system is busy */ + msecs = (pool->sp_reclaim == DAOS_RECLAIM_LAZY || + pool->sp_reclaim == DAOS_RECLAIM_DISABLED) ? 
1000 : 50; + sched_req_sleep(req, msecs); + /* Let GC ULT run in slack mode when system is busy and no space pressure */ return 1; } @@ -1404,10 +1393,12 @@ pool_query_one(void *vin) static int pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) { - struct dss_coll_ops coll_ops; - struct dss_coll_args coll_args = { 0 }; - struct pool_query_xs_arg agg_arg = { 0 }; - int rc; + struct dss_coll_ops coll_ops; + struct dss_coll_args coll_args = { 0 }; + struct pool_query_xs_arg agg_arg = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; D_ASSERT(ps != NULL); memset(ps, 0, sizeof(*ps)); @@ -1425,24 +1416,32 @@ pool_tgt_query(struct ds_pool *pool, struct daos_pool_space *ps) coll_args.ca_aggregator = &agg_arg; coll_args.ca_func_args = &coll_args.ca_stream_args; - rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_failed_tgt_idx(pool->sp_uuid, &exclude_tgts, &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID": failed to get index : rc "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - return rc; + goto out; + } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, &coll_args.ca_tgt_bitmap, + &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto out; } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, 0); - D_FREE(coll_args.ca_exclude_tgts); - if (rc) { + if (rc != 0) { D_ERROR("Pool query on pool "DF_UUID" failed, "DF_RC"\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); - return rc; + goto out; } *ps = agg_arg.qxa_space; + +out: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); return rc; } @@ -1615,6 +1614,7 @@ update_child_map(void *data) return 0; } + ds_cont_child_reset_ec_agg_eph_all(child); child->spc_map_version = pool->sp_map_version; ds_pool_child_put(child); return 0; @@ -2169,9 +2169,11 @@ ds_pool_tgt_discard_ult(void *data) { struct ds_pool *pool; struct tgt_discard_arg *arg = data; - struct dss_coll_ops coll_ops = { 0 }; - struct dss_coll_args coll_args = { 0 }; - int rc; + struct dss_coll_ops coll_ops = { 0 }; + struct dss_coll_args coll_args = { 0 }; + int *exclude_tgts = NULL; + uint32_t exclude_tgt_nr = 0; + int rc = 0; /* If discard failed, let's still go ahead, since reintegration might * still succeed, though it might leave some garbage on the reintegration @@ -2194,21 +2196,28 @@ ds_pool_tgt_discard_ult(void *data) */ status = PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN | PO_COMP_ST_DOWN | PO_COMP_ST_NEW; - rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, - &coll_args.ca_exclude_tgts, - &coll_args.ca_exclude_tgts_cnt); - if (rc) { + rc = ds_pool_get_tgt_idx_by_state(arg->pool_uuid, status, &exclude_tgts, + &exclude_tgt_nr); + if (rc != 0) { D_ERROR(DF_UUID "failed to get index : rc "DF_RC"\n", DP_UUID(arg->pool_uuid), DP_RC(rc)); D_GOTO(put, rc); } + + if (exclude_tgts != NULL) { + rc = dss_build_coll_bitmap(exclude_tgts, exclude_tgt_nr, + &coll_args.ca_tgt_bitmap, &coll_args.ca_tgt_bitmap_sz); + if (rc != 0) + goto put; + } } rc = dss_thread_collective_reduce(&coll_ops, &coll_args, DSS_ULT_DEEP_STACK); - if (coll_args.ca_exclude_tgts) - D_FREE(coll_args.ca_exclude_tgts); DL_CDEBUG(rc == 0, DB_MD, DLOG_ERR, rc, DF_UUID " tgt discard", DP_UUID(arg->pool_uuid)); + put: + D_FREE(coll_args.ca_tgt_bitmap); + D_FREE(exclude_tgts); pool->sp_need_discard = 0; pool->sp_discard_status = rc; diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index 9a8e00c21b1..9df333e16c1 100644 --- 
a/src/pool/srv_util.c
+++ b/src/pool/srv_util.c
@@ -220,146 +220,451 @@ ds_pool_transfer_map_buf(struct pool_buf *map_buf, uint32_t map_version,
         return rc;
 }
 
-/* Move a rank that is not exception from the end of src to the end of dst. */
+/*
+ * Compute the PS reconfiguration objective, that is, the number of replicas we
+ * want to achieve.
+ */
 static int
-move_rank_except_for(d_rank_t exception, d_rank_list_t *src, d_rank_list_t *dst)
+compute_svc_reconf_objective(int svc_rf, d_rank_list_t *replicas)
 {
-        int i;
-        int rc;
+        /*
+         * If the PS RF is unknown, we choose the greater one between the
+         * default PS RF and the one implied by the current number of replicas.
+         */
+        if (svc_rf < 0) {
+                svc_rf = ds_pool_svc_rf_from_nreplicas(replicas->rl_nr);
+                if (svc_rf < DAOS_PROP_PO_SVC_REDUN_FAC_DEFAULT)
+                        svc_rf = DAOS_PROP_PO_SVC_REDUN_FAC_DEFAULT;
+        }
 
-        /* Choose the last rank that is not exception in src. */
-        if (src->rl_nr == 0)
-                return -DER_NONEXIST;
-        i = src->rl_nr - 1;
-        if (src->rl_ranks[i] == exception)
-                i--;
-        if (i < 0)
-                return -DER_NONEXIST;
-        D_ASSERT(src->rl_ranks[i] != exception);
+        return ds_pool_svc_rf_to_nreplicas(svc_rf);
+}
+
+static void
+rank_list_del_at(d_rank_list_t *list, int index)
+{
+        D_ASSERTF(0 <= index && index < list->rl_nr, "index=%d rl_nr=%u\n", index, list->rl_nr);
+        memmove(&list->rl_ranks[index], &list->rl_ranks[index + 1],
+                (list->rl_nr - index - 1) * sizeof(list->rl_ranks[0]));
+        list->rl_nr--;
+}
+
+/*
+ * Ephemeral "reconfiguration domain" used by ds_pool_plan_svc_reconfs to track
+ * aspects of domains that include at least one engine in POOL_SVC_MAP_STATES.
+ *
+ * The rcd_n_replicas field is the number of replicas in this domain.
+ *
+ * The rcd_n_engines field is the number of POOL_SVC_MAP_STATES engines.
+ *
+ * The number of vacant engines is therefore rcd_n_engines - rcd_n_replicas. We
+ * always have 0 <= rcd_n_replicas <= rcd_n_engines and rcd_n_engines > 0.
+ */
+struct reconf_domain {
+        struct pool_domain *rcd_domain;
+        int                 rcd_n_replicas;
+        int                 rcd_n_engines;
+};
+
+/*
+ * Ephemeral "reconfiguration map" used by ds_pool_plan_svc_reconfs to track
+ * aspects of the pool map and the replicas.
+ *
+ * The rcm_domains field points to a shuffle of all domains that include at
+ * least one engine in POOL_SVC_MAP_STATES.
+ *
+ * The rcm_domains_n_engines_max field stores the maximum of the rcd_n_engines
+ * field across rcm_domains.
+ *
+ * The rcm_replicas field points to a rank list of all replicas underneath
+ * rcm_domains.
+ */
+struct reconf_map {
+        struct reconf_domain *rcm_domains;
+        int                   rcm_domains_len;
+        int                   rcm_domains_n_engines_max;
+        d_rank_list_t        *rcm_replicas;
+};
 
-        /* Add it to dst first, as this may return an error. */
-        rc = d_rank_list_append(dst, src->rl_ranks[i]);
+/*
+ * Given map and replicas, initialize rmap_out, and append all undesired
+ * replicas to to_remove.
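
Before moving on, compute_svc_reconf_objective() above is worth making concrete. A minimal sketch, under the assumption (worth verifying against the DAOS tree) that the ds_pool_svc_rf_* helpers implement the usual 2f+1 majority relation:

/* Assumed relation; ds_pool_svc_rf_to_nreplicas/_from_nreplicas are the
 * real helpers and may differ in edge cases. */
static inline int ex_rf_to_nreplicas(int rf)  { return 2 * rf + 1; }
static inline int ex_rf_from_nreplicas(int n) { return n > 0 ? (n - 1) / 2 : 0; }

static int
ex_objective(int svc_rf, int n_replicas, int default_rf)
{
        if (svc_rf < 0) {
                /* RF unknown: infer it, but never go below the default. */
                svc_rf = ex_rf_from_nreplicas(n_replicas);
                if (svc_rf < default_rf)
                        svc_rf = default_rf;
        }
        return ex_rf_to_nreplicas(svc_rf);
}

For example, with default_rf = 2, an unknown RF and 3 current replicas gives max(1, 2) = 2, hence an objective of 5 replicas.
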
+ */ +static int +init_reconf_map(struct pool_map *map, d_rank_list_t *replicas, d_rank_t self, + struct reconf_map *rmap_out, d_rank_list_t *to_remove) +{ + struct reconf_map rmap = {0}; + struct pool_domain *domains; + int domains_len; + d_rank_list_t *replicas_left = NULL; + int i; + int rc; + + domains_len = pool_map_find_domain(map, PO_COMP_TP_NODE, PO_COMP_ID_ALL, &domains); + D_ASSERTF(domains_len > 0, "pool_map_find_domain: %d\n", domains_len); + + D_ALLOC_ARRAY(rmap.rcm_domains, domains_len); + if (rmap.rcm_domains == NULL) { + rc = -DER_NOMEM; + goto out; + } + + rmap.rcm_replicas = d_rank_list_alloc(0); + if (rmap.rcm_replicas == NULL) { + rc = -DER_NOMEM; + goto out; + } + + /* + * Use a duplicate so that we can delete the replica rank from it + * whenever we find a replica. This can speed up future iteration of + * the following loop, and leave us with a list of replicas that are + * outside of the pool map. + */ + rc = d_rank_list_dup(&replicas_left, replicas); if (rc != 0) - return rc; + goto out; - /* Remove it from src. */ - if (i < src->rl_nr - 1) - src->rl_ranks[i] = src->rl_ranks[src->rl_nr - 1]; - src->rl_nr--; + /* + * Go through all PO_COMP_TP_NODE domains and their engines in the pool + * map, in order to populate rmap and to_remove. + */ + for (i = 0; i < domains_len; i++) { + struct pool_domain *domain = &domains[i]; + int j; + int n_engines = 0; + int n_replicas = 0; + + for (j = 0; j < domain->do_comp.co_nr; j++) { + struct pool_domain *engine = &domain->do_children[j]; + bool is_desired; + int k; + d_rank_list_t *list; + + is_desired = engine->do_comp.co_status & POOL_SVC_MAP_STATES; + if (is_desired) + n_engines++; + + if (!d_rank_list_find(replicas_left, engine->do_comp.co_rank, &k)) + continue; + + rank_list_del_at(replicas_left, k); + if (is_desired) { + list = rmap.rcm_replicas; + n_replicas++; + } else { + list = to_remove; + if (engine->do_comp.co_rank == self) { + D_ERROR("self undesired: state=%x\n", + engine->do_comp.co_status); + rc = -DER_INVAL; + goto out; + } + } + rc = d_rank_list_append(list, engine->do_comp.co_rank); + if (rc != 0) + goto out; + } - return 0; + /* If a domain has no desired engine, we won't consider it. */ + if (n_engines == 0) + continue; + + /* Add this domain to rmap. */ + rmap.rcm_domains[rmap.rcm_domains_len].rcd_domain = domain; + rmap.rcm_domains[rmap.rcm_domains_len].rcd_n_engines = n_engines; + rmap.rcm_domains[rmap.rcm_domains_len].rcd_n_replicas = n_replicas; + rmap.rcm_domains_len++; + if (n_engines > rmap.rcm_domains_n_engines_max) + rmap.rcm_domains_n_engines_max = n_engines; + } + + /* + * Hypothetically, if there are replicas that are not found in the pool + * map, put them in to_remove. + */ + for (i = 0; i < replicas_left->rl_nr; i++) { + rc = d_rank_list_append(to_remove, replicas_left->rl_ranks[i]); + if (rc != 0) + goto out; + } + + /* Shuffle rmap.rcm_domains for randomness in replica placement. 
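
The loop that follows this comment is a textbook Fisher-Yates shuffle. As a stand-alone sketch, with rand() standing in for d_randn(), which returns a uniform value in [0, n) (rand() % n is slightly biased, but fine for illustration):

#include <stdlib.h>

static void
ex_shuffle(int *a, int len)
{
        int i;

        for (i = 0; i < len; i++) {
                int j = i + rand() % (len - i); /* uniform pick from [i, len) */
                int t = a[i];

                a[i] = a[j];
                a[j] = t;
        }
}
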
*/ + for (i = 0; i < rmap.rcm_domains_len; i++) { + int j = i + d_randn(rmap.rcm_domains_len - i); + + D_ASSERTF(i <= j && j < rmap.rcm_domains_len, "i=%d j=%d len=%d\n", i, j, + rmap.rcm_domains_len); + if (j != i) { + struct reconf_domain t = rmap.rcm_domains[i]; + + rmap.rcm_domains[i] = rmap.rcm_domains[j]; + rmap.rcm_domains[j] = t; + } + } + + rc = 0; +out: + d_rank_list_free(replicas_left); + if (rc == 0) { + *rmap_out = rmap; + } else { + d_rank_list_free(rmap.rcm_replicas); + D_FREE(rmap.rcm_domains); + } + return rc; } -#if 0 /* unit tests for move_rank_except_for */ -void -ds_pool_test_move_rank_except_for(void) +static void +fini_reconf_map(struct reconf_map *rmap) { - d_rank_list_t src; - d_rank_list_t dst; - int rc; + d_rank_list_free(rmap->rcm_replicas); + D_FREE(rmap->rcm_domains); +} - { - src.rl_ranks = NULL; - src.rl_nr = 0; - dst.rl_ranks = NULL; - dst.rl_nr = 0; - rc = move_rank_except_for(CRT_NO_RANK, &src, &dst); - D_ASSERT(rc == -DER_NONEXIST); +/* Find in rdomain a random engine that is not in replicas. */ +static d_rank_t +find_vacancy_in_domain(struct reconf_domain *rdomain, d_rank_list_t *replicas) +{ + int n = rdomain->rcd_n_engines - rdomain->rcd_n_replicas; + int i; + + D_ASSERTF(n >= 0, "invalid n: %d: rcd_n_engines=%d rcd_n_replicas=%d\n", n, + rdomain->rcd_n_engines, rdomain->rcd_n_replicas); + if (n == 0) + return CRT_NO_RANK; + + for (i = 0; i < rdomain->rcd_domain->do_comp.co_nr; i++) { + struct pool_domain *engine = &rdomain->rcd_domain->do_children[i]; + + if ((engine->do_comp.co_status & POOL_SVC_MAP_STATES) && + !d_rank_list_find(replicas, engine->do_comp.co_rank, NULL /* idx */)) { + /* Pick this vacant engine with a probability of 1/n. */ + if (d_randn(n) == 0) + return engine->do_comp.co_rank; + n--; + } } - { - d_rank_t src_ranks[] = {0}; + return CRT_NO_RANK; +} - src.rl_ranks = src_ranks; - src.rl_nr = 1; - dst.rl_ranks = NULL; - dst.rl_nr = 0; - rc = move_rank_except_for(0, &src, &dst); - D_ASSERT(rc == -DER_NONEXIST); +/* Find in rdomain a random engine that is in replicas but not self. */ +static d_rank_t +find_replica_in_domain(struct reconf_domain *rdomain, d_rank_list_t *replicas, d_rank_t self) +{ + int n = rdomain->rcd_n_replicas; + int i; + + /* If ourself is in this domain, decrement n. 
*/ + for (i = 0; i < rdomain->rcd_domain->do_comp.co_nr; i++) { + struct pool_domain *engine = &rdomain->rcd_domain->do_children[i]; + + if (engine->do_comp.co_rank == self) { + n--; + break; + } } - { - d_rank_t src_ranks[] = {2}; + D_ASSERTF(n >= 0, "invalid n: %d: rcd_n_engines=%d rcd_n_replicas=%d rl_nr=%d self=%u\n", n, + rdomain->rcd_n_engines, rdomain->rcd_n_replicas, replicas->rl_nr, self); + if (n == 0) + return CRT_NO_RANK; - src.rl_ranks = src_ranks; - src.rl_nr = 1; - dst.rl_ranks = NULL; - dst.rl_nr = 0; - rc = move_rank_except_for(CRT_NO_RANK, &src, &dst); - D_ASSERT(rc == 0); - D_ASSERT(src.rl_nr == 0); - D_ASSERT(dst.rl_nr == 1); - D_ASSERT(dst.rl_ranks[0] == 2); - D_FREE(dst.rl_ranks); + for (i = 0; i < rdomain->rcd_domain->do_comp.co_nr; i++) { + struct pool_domain *engine = &rdomain->rcd_domain->do_children[i]; + + if ((engine->do_comp.co_status & POOL_SVC_MAP_STATES) && + engine->do_comp.co_rank != self && + d_rank_list_find(replicas, engine->do_comp.co_rank, NULL /* idx */)) { + if (d_randn(n) == 0) + return engine->do_comp.co_rank; + n--; + } } - { - d_rank_t src_ranks[] = {2, 5}; + return CRT_NO_RANK; +} - src.rl_ranks = src_ranks; - src.rl_nr = 2; - dst.rl_ranks = NULL; - dst.rl_nr = 0; - rc = move_rank_except_for(5, &src, &dst); - D_ASSERT(rc == 0); - D_ASSERT(src.rl_nr == 1); - D_ASSERT(src.rl_ranks[0] == 5); - D_ASSERT(dst.rl_nr == 1); - D_ASSERT(dst.rl_ranks[0] == 2); - D_FREE(dst.rl_ranks); +/* + * Find engines for at most n replicas, and append their ranks to to_add. + * Return the number of ranks appended or an error. If not zero, the + * domains_n_engines_max parameter overrides + * rmap->rcm_domains_n_engines_max. + */ +static int +add_replicas(int n, struct reconf_map *rmap, int domains_n_engines_max, d_rank_list_t *to_add) +{ + int n_appended = 0; + int i; + + D_ASSERTF(n > 0, "invalid n: %d\n", n); + D_ASSERTF(0 <= domains_n_engines_max && + domains_n_engines_max <= rmap->rcm_domains_n_engines_max, + "invalid domains_n_engines_max: %d: rcm_domains_n_engines_max=%d\n", + domains_n_engines_max, rmap->rcm_domains_n_engines_max); + + if (domains_n_engines_max == 0) + domains_n_engines_max = rmap->rcm_domains_n_engines_max; + + /* We start from domains with least replicas. */ + for (i = 0; i < domains_n_engines_max; i++) { + int j; + + /* For each domain with i replicas and more than i engines... */ + for (j = 0; j < rmap->rcm_domains_len; j++) { + struct reconf_domain *rdomain = &rmap->rcm_domains[j]; + d_rank_t rank; + int rc; + + if (rdomain->rcd_n_replicas != i || + rdomain->rcd_n_replicas == rdomain->rcd_n_engines) + continue; + + /* This domain has at least one vacant engine. 
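
find_vacancy_in_domain() and find_replica_in_domain() above share a one-pass uniform selection trick: with n qualifying candidates remaining, take the current one with probability 1/n, otherwise decrement n. A self-contained sketch under the same scheme (pred() and all names are hypothetical):

#include <stdlib.h>

static int
ex_pick_uniform(const int *items, int len, int n_candidates, int (*pred)(int))
{
        int n = n_candidates;   /* qualifying items not yet considered */
        int i;

        if (n == 0)
                return -1;      /* nothing to pick from */

        for (i = 0; i < len; i++) {
                if (!pred(items[i]))
                        continue;
                if (rand() % n == 0)    /* probability 1/n */
                        return items[i];
                n--;
        }
        return -1;      /* fallback if fewer than n_candidates qualify */
}

The chance of picking the k-th qualifying item is (1 - 1/n)(1 - 1/(n-1))...(1/(n-k)) = 1/n, so the choice is uniform without ever materializing the candidate list.
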
*/ + rank = find_vacancy_in_domain(rdomain, rmap->rcm_replicas); + D_ASSERT(rank != CRT_NO_RANK); + + rc = d_rank_list_append(to_add, rank); + if (rc != 0) + return rc; + rc = d_rank_list_append(rmap->rcm_replicas, rank); + if (rc != 0) + return rc; + + rdomain->rcd_n_replicas++; + n_appended++; + if (n_appended == n) + return n; + } } + + return n_appended; +} + +static int +remove_replica_in_domain(struct reconf_domain *rdomain, d_rank_list_t *replicas, d_rank_t self, + d_rank_list_t *to_remove) +{ + d_rank_t rank; + int k; + bool found; + int rc; + + rank = find_replica_in_domain(rdomain, replicas, self); + if (rank == CRT_NO_RANK) + return -DER_NONEXIST; + + rc = d_rank_list_append(to_remove, rank); + if (rc != 0) + return rc; + + found = d_rank_list_find(replicas, rank, &k); + D_ASSERT(found); + rank_list_del_at(replicas, k); + + rdomain->rcd_n_replicas--; + return 0; } -#endif /* - * Compute the PS reconfiguration objective, that is, the number of replicas we - * want to achieve. + * Find at most n replicas and append their ranks to to_remove. Return the + * number of ranks appended or an error. */ static int -compute_svc_reconf_objective(int svc_rf, d_rank_list_t *replicas) +remove_replicas(int n, struct reconf_map *rmap, d_rank_t self, d_rank_list_t *to_remove) { + int n_appended = 0; + int i; + + D_ASSERTF(n > 0, "invalid n: %d\n", n); + /* - * If the PS RF is unknown, we choose the greater one between the - * default PS RF and the one implied by the current number of replicas. + * We start from domains with most replicas, so that the subsequent + * balance_replicas call will produce less reconfigurations. The + * algorithm here could perhaps be improved by maintaining an + * rcd_n_replicas-sorted array of domains that each has at least one + * replica. */ - if (svc_rf < 0) { - svc_rf = ds_pool_svc_rf_from_nreplicas(replicas->rl_nr); - if (svc_rf < DAOS_PROP_PO_SVC_REDUN_FAC_DEFAULT) - svc_rf = DAOS_PROP_PO_SVC_REDUN_FAC_DEFAULT; + for (i = rmap->rcm_domains_n_engines_max; i > 0; i--) { + int j; + + /* For each domain with i replicas... */ + for (j = 0; j < rmap->rcm_domains_len; j++) { + struct reconf_domain *rdomain = &rmap->rcm_domains[j]; + int rc; + + if (rdomain->rcd_n_replicas != i) + continue; + + rc = remove_replica_in_domain(rdomain, rmap->rcm_replicas, self, to_remove); + if (rc == -DER_NONEXIST) + continue; + else if (rc != 0) + return rc; + + n_appended++; + if (n_appended == n) + return n; + } } - return ds_pool_svc_rf_to_nreplicas(svc_rf); + return n_appended; } /* - * Find n ranks with states in nodes but not in blacklist, and append them to - * list. Return the number of ranks appended or an error. + * Move replicas so that if there is a domain with i replicas, then all domains + * with less than i - 1 replicas are full. */ static int -find_ranks(int n, pool_comp_state_t states, struct pool_domain *nodes, int nnodes, - d_rank_list_t *blacklist, d_rank_list_t *list) +balance_replicas(struct reconf_map *rmap, d_rank_t self, d_rank_list_t *to_add, + d_rank_list_t *to_remove) { - int n_appended = 0; - int i; - int rc; + int i; - if (n == 0) - return 0; + /* + * We start from domains with most replicas. Since moving a replica + * from a domain with only one replica does not make sense (see + * below), we stop when i == 1. The algorithm here could perhaps be + * improved in a similar manner as remove_replicas. 
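
balance_replicas() below enforces the invariant stated in its comment: whenever some domain holds i replicas, every domain with fewer than i - 1 replicas must be full. A small checker makes that concrete (hypothetical arrays: replicas[d] and engines[d] are per-domain counts, purely for illustration):

#include <stdbool.h>

static bool
ex_balanced(const int *replicas, const int *engines, int n_domains)
{
        int i, j;

        for (i = 0; i < n_domains; i++)
                for (j = 0; j < n_domains; j++)
                        /* a domain with fewer than replicas[i] - 1 replicas
                         * must have no vacant engines left */
                        if (replicas[j] < replicas[i] - 1 && replicas[j] < engines[j])
                                return false;
        return true;
}

For example, counts {3, 0} over engines {3, 2} fail the check (the second domain is neither full nor within one of 3 - 1), while counts {2, 1} pass.
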
+ */ + for (i = rmap->rcm_domains_n_engines_max; i > 1; i--) { + int j; - for (i = 0; i < nnodes; i++) { - if (!(nodes[i].do_comp.co_status & states)) - continue; - if (d_rank_list_find(blacklist, nodes[i].do_comp.co_rank, NULL /* idx */)) - continue; - rc = d_rank_list_append(list, nodes[i].do_comp.co_rank); - if (rc != 0) - return rc; - n_appended++; - if (n_appended == n) - break; + /* For each domain with i replicas... */ + for (j = 0; j < rmap->rcm_domains_len; j++) { + struct reconf_domain *rdomain = &rmap->rcm_domains[j]; + int rc; + + if (rdomain->rcd_n_replicas != i) + continue; + + /* + * Try to add a replica with add_replicas, such that + * the domain containing the new replica will have at + * most i - 1 replicas. Since i > 1, we know that + * i - 1 > 0. + */ + rc = add_replicas(1, rmap, i - 1 /* domains_n_engines_max */, to_add); + if (rc < 0) + return rc; + else if (rc == 0) + continue; + + /* + * Remove a replica from the current domain. Since + * i > 1, there must be at least one replica we can + * remove. + */ + rc = remove_replica_in_domain(rdomain, rmap->rcm_replicas, self, to_remove); + D_ASSERT(rc != -DER_NONEXIST); + if (rc != 0) + return rc; + } } - return n_appended; + return 0; } /** @@ -384,83 +689,58 @@ find_ranks(int n, pool_comp_state_t states, struct pool_domain *nodes, int nnode * * \param[in] svc_rf PS redundancy factor * \param[in] map pool map - * \param[in] replicas current PS membership - * \param[in] self self rank + * \param[in] replicas current PS membership (may be empty if \a svc_rf >= 0) + * \param[in] self self replica rank (may be CRT_NO_RANK if we're not a replica) + * \param[in] filter_only only filter out replicas not in POOL_SVC_MAP_STATES * \param[out] to_add_out PS replicas to add * \param[out] to_remove_out PS replicas to remove */ int ds_pool_plan_svc_reconfs(int svc_rf, struct pool_map *map, d_rank_list_t *replicas, d_rank_t self, - d_rank_list_t **to_add_out, d_rank_list_t **to_remove_out) + bool filter_only, d_rank_list_t **to_add_out, + d_rank_list_t **to_remove_out) { - struct pool_domain *nodes = NULL; - int nnodes; int objective; - d_rank_list_t *desired = NULL; d_rank_list_t *to_add = NULL; d_rank_list_t *to_remove = NULL; - int i; + struct reconf_map rmap; + int n; int rc; - nnodes = pool_map_find_nodes(map, PO_COMP_ID_ALL, &nodes); - D_ASSERTF(nnodes > 0, "pool_map_find_nodes: %d\n", nnodes); - objective = compute_svc_reconf_objective(svc_rf, replicas); - desired = d_rank_list_alloc(0); to_add = d_rank_list_alloc(0); to_remove = d_rank_list_alloc(0); - if (desired == NULL || to_add == NULL || to_remove == NULL) { + if (to_add == NULL || to_remove == NULL) { rc = -DER_NOMEM; goto out; } - /* Classify replicas into desired and to_remove. */ - for (i = 0; i < replicas->rl_nr; i++) { - d_rank_t rank = replicas->rl_ranks[i]; - d_rank_list_t *list; - int j; + rc = init_reconf_map(map, replicas, self, &rmap, to_remove); + if (rc != 0) + goto out; - for (j = 0; j < nnodes; j++) - if (nodes[j].do_comp.co_rank == rank) - break; - if (j == nnodes) /* not found (hypothetical) */ - list = to_remove; - else if (nodes[j].do_comp.co_status & POOL_SVC_MAP_STATES) - list = desired; - else - list = to_remove; - if (rank == self && list == to_remove) { - D_ERROR("self undesired: state=%x\n", - j < nnodes ? 
nodes[j].do_comp.co_status : -1); - rc = -DER_INVAL; - goto out; - } - rc = d_rank_list_append(list, rank); - if (rc != 0) - goto out; + D_DEBUG(DB_MD, "domains=%d n_engines_max=%d replicas=%u remove=%u filter_only=%d\n", + rmap.rcm_domains_len, rmap.rcm_domains_n_engines_max, rmap.rcm_replicas->rl_nr, + to_remove->rl_nr, filter_only); + + if (filter_only) { + rc = 0; + goto out_rmap; } - D_DEBUG(DB_MD, "desired=%u undesired=%u objective=%d\n", desired->rl_nr, to_remove->rl_nr, - objective); + n = rmap.rcm_replicas->rl_nr - objective; + if (n < 0) + rc = add_replicas(-n, &rmap, 0 /* domains_n_engines_max */, to_add); + else if (n > 0) + rc = remove_replicas(n, &rmap, self, to_remove); + if (rc < 0) + goto out_rmap; - if (desired->rl_nr > objective) { - /* Too many replicas, remove one by one. */ - do { - rc = move_rank_except_for(self, desired, to_remove); - D_ASSERT(rc != -DER_NONEXIST); - if (rc != 0) - goto out; - } while (desired->rl_nr > objective); - } else if (desired->rl_nr < objective) { - /* Too few replicas, add some. */ - rc = find_ranks(objective - desired->rl_nr, POOL_SVC_MAP_STATES, nodes, nnodes, - desired, to_add); - if (rc < 0) - goto out; - } + rc = balance_replicas(&rmap, self, to_add, to_remove); - rc = 0; +out_rmap: + fini_reconf_map(&rmap); out: if (rc == 0) { *to_add_out = to_add; @@ -469,7 +749,6 @@ ds_pool_plan_svc_reconfs(int svc_rf, struct pool_map *map, d_rank_list_t *replic d_rank_list_free(to_remove); d_rank_list_free(to_add); } - d_rank_list_free(desired); return rc; } @@ -504,27 +783,144 @@ testu_rank_sets_belong(d_rank_list_t *x, d_rank_t *y_ranks, int y_ranks_len) return true; } +/* Add all ranks in y to x. Just append; no dup check. */ +static d_rank_list_t * +testu_rank_sets_add(d_rank_t *x_ranks, int x_ranks_len, d_rank_list_t *y) +{ + d_rank_list_t *z; + int i; + int rc; + + z = d_rank_list_alloc(0); + D_ASSERT(z != NULL); + + for (i = 0; i < x_ranks_len; i++) { + rc = d_rank_list_append(z, x_ranks[i]); + D_ASSERT(rc == 0); + } + + for (i = 0; i < y->rl_nr; i++) { + rc = d_rank_list_append(z, y->rl_ranks[i]); + D_ASSERT(rc == 0); + } + + return z; +} + +/* Subtract all ranks in y from x. 
*/ +static d_rank_list_t * +testu_rank_sets_subtract(d_rank_t *x_ranks, int x_ranks_len, d_rank_list_t *y) +{ + d_rank_list_t *z; + int i; + int rc; + + z = d_rank_list_alloc(0); + D_ASSERT(z != NULL); + + for (i = 0; i < x_ranks_len; i++) { + rc = d_rank_list_append(z, x_ranks[i]); + D_ASSERT(rc == 0); + } + + for (i = 0; i < y->rl_nr; i++) { + int j; + + if (d_rank_list_find(z, y->rl_ranks[i], &j)) + rank_list_del_at(z, j); + } + + return z; +} + +static int +testu_cmp_ints(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static bool +testu_rank_set_has_dist(d_rank_list_t *ranks, int *dist, int dist_len) +{ + int *counts; + int i; + + D_ALLOC_ARRAY(counts, dist_len); + D_ASSERT(counts != NULL); + for (i = 0; i < ranks->rl_nr; i++) { + int domain = ranks->rl_ranks[i] / 10; + + D_ASSERT(0 <= domain && domain < dist_len); + counts[domain]++; + } + + qsort(dist, dist_len, sizeof(int), testu_cmp_ints); + qsort(counts, dist_len, sizeof(int), testu_cmp_ints); + return memcmp(counts, dist, dist_len * sizeof(int)) == 0; +} + +static void +testu_create_domain_buf(d_rank_t *ranks, int n_ranks, uint32_t **domain_buf_out, + int *domain_buf_len_out) +{ + int n_domains = ranks[n_ranks - 1] / 10 + 1; + int n_ranks_per_domain = ranks[n_ranks - 1] % 10 + 1; + uint32_t *domain_buf; + int domain_buf_len; + uint32_t *p; + int i; + + D_ASSERT(n_domains * n_ranks_per_domain == n_ranks); + + domain_buf_len = 3 /* root */ + 3 * n_domains + n_ranks; + D_ALLOC_ARRAY(domain_buf, domain_buf_len); + D_ASSERT(domain_buf != NULL); + + /* The root. */ + p = domain_buf; + D_ASSERT(domain_buf <= p && p + 3 <= domain_buf + domain_buf_len); + p[0] = 2; + p[1] = 0; + p[2] = n_domains; + p += 3; + + /* The domains. */ + for (i = 0; i < n_domains; i++) { + D_ASSERT(domain_buf <= p && p + 3 <= domain_buf + domain_buf_len); + p[0] = 1; + p[1] = i; + p[2] = n_ranks_per_domain; + p += 3; + } + + /* The ranks. */ + for (i = 0; i < n_ranks; i++) { + D_ASSERT(domain_buf <= p && p + 1 <= domain_buf + domain_buf_len); + p[0] = ranks[i]; + p++; + } + + for (i = 0; i < domain_buf_len; i++) + D_INFO("domain_buf[%d]: %u\n", i, domain_buf[i]); + + *domain_buf_out = domain_buf; + *domain_buf_len_out = domain_buf_len; +} + static struct pool_map * testu_create_pool_map(d_rank_t *ranks, int n_ranks, d_rank_t *down_ranks, int n_down_ranks) { struct pool_buf *map_buf; struct pool_map *map; - uint32_t *domains; - int n_domains = 3 + n_ranks; + uint32_t *domain_buf; + int domain_buf_len; int i; int rc; - /* Not using domains properly at the moment. See FD_TREE_TUNPLE_LEN. 
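
For orientation, here is a worked example of the buffer testu_create_domain_buf() above builds. Reading each 3-tuple as {level, id, nr} is an assumption from the code, not a documented layout. For ranks {0, 1, 10, 11}: n_domains = 11/10 + 1 = 2, n_ranks_per_domain = 11 % 10 + 1 = 2, and domain_buf_len = 3 + 3 * 2 + 4 = 13:

#include <stdint.h>

static const uint32_t ex_domain_buf[] = {
        2, 0, 2,        /* root: two child domains */
        1, 0, 2,        /* domain 0: two ranks */
        1, 1, 2,        /* domain 1: two ranks */
        0, 1, 10, 11,   /* the rank IDs themselves */
};
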
*/
-        D_ALLOC_ARRAY(domains, n_domains);
-        D_ASSERT(domains != NULL);
-        domains[0] = 1;
-        domains[1] = 0;
-        domains[2] = n_ranks;
-        for (i = 0; i < n_ranks; i++)
-                domains[3 + i] = i;
-
-        rc = gen_pool_buf(NULL /* map */, &map_buf, 1 /* map_version */, n_domains,
-                          n_ranks, n_ranks * 1 /* ntargets */, domains, 1 /* dss_tgt_nr */);
+        testu_create_domain_buf(ranks, n_ranks, &domain_buf, &domain_buf_len);
+
+        rc = gen_pool_buf(NULL /* map */, &map_buf, 1 /* map_version */, domain_buf_len,
+                          n_ranks, n_ranks * 1 /* ntargets */, domain_buf, 1 /* dss_tgt_nr */);
         D_ASSERT(rc == 0);
 
         rc = pool_map_create(map_buf, 1, &map);
@@ -539,7 +935,7 @@ testu_create_pool_map(d_rank_t *ranks, int n_ranks, d_rank_t *down_ranks, int n_
         }
 
         pool_buf_free(map_buf);
-        D_FREE(domains);
+        D_FREE(domain_buf);
 
         return map;
 }
@@ -558,7 +954,7 @@ testu_plan_svc_reconfs(int svc_rf, d_rank_t ranks[], int n_ranks, d_rank_t down_
         replicas_list.rl_ranks = replicas_ranks;
         replicas_list.rl_nr = n_replicas_ranks;
 
-        rc = ds_pool_plan_svc_reconfs(svc_rf, map, &replicas_list, self, to_add, to_remove);
+        rc = ds_pool_plan_svc_reconfs(svc_rf, map, &replicas_list, self, false, to_add, to_remove);
         D_ASSERTF(rc == expected_rc, "rc=%d expected_rc=%d\n", rc, expected_rc);
 
         pool_map_decref(map);
@@ -567,7 +963,6 @@
 void
 ds_pool_test_plan_svc_reconfs(void)
 {
-        d_rank_t self = 0;
         d_rank_list_t *to_add;
         d_rank_list_t *to_remove;
@@ -581,12 +976,78 @@ ds_pool_test_plan_svc_reconfs(void)
                 d_rank_list_free(to_add);        \
                 d_rank_list_free(to_remove);
 
+        /*
+         * We encode domains into ranks: rank / 10 = domain. For example, rank
+         * 1 is in domain 0, rank 11 is in domain 1, and rank 20 is in domain
+         * 2. See testu_rank_to_domain. Hence, each domain can have at most 10
+         * ranks.
+         *
+         * The ranks arrays below must be monotonically increasing.
+         */
+
+        /* A PS is created with one replica per domain. */
+        {
+                int svc_rf = 2;
+                d_rank_t ranks[] = {
+                         0,  1,
+                        10, 11,
+                        20, 21,
+                        30, 31,
+                        40, 41,
+                        50, 51
+                };
+                d_rank_t down_ranks[] = {};
+                d_rank_t replicas_ranks[] = {};
+                d_rank_t self = CRT_NO_RANK;
+                int expected_dist[] = {0, 1, 1, 1, 1, 1};
+
+                call_testu_plan_svc_reconfs(0)
+
+                D_ASSERT(to_add->rl_nr == 5);
+                D_ASSERT(to_remove->rl_nr == 0);
+
+                D_ASSERT(testu_rank_set_has_dist(to_add, expected_dist, ARRAY_SIZE(expected_dist)));
+
+                call_d_rank_list_free
+        }
+
+        /* A PS is created with multiple replicas per domain. */
+        {
+                int svc_rf = 2;
+                d_rank_t ranks[] = {
+                         0,  1,  2,
+                        10, 11, 12,
+                        20, 21, 22
+                };
+                d_rank_t down_ranks[] = {};
+                d_rank_t replicas_ranks[] = {};
+                d_rank_t self = CRT_NO_RANK;
+                int expected_dist[] = {1, 2, 2};
+
+                call_testu_plan_svc_reconfs(0)
+
+                D_ASSERT(to_add->rl_nr == 5);
+                D_ASSERT(to_remove->rl_nr == 0);
+
+                D_ASSERT(testu_rank_set_has_dist(to_add, expected_dist, ARRAY_SIZE(expected_dist)));
+
+                call_d_rank_list_free
+        }
+
         /* A happy PS does not want any changes.
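
The rank-to-domain encoding described in the comment above is simply integer division; a helper in the spirit of the testu_rank_to_domain the comment cites (not shown in this hunk):

static inline int
ex_rank_to_domain(int rank)
{
        return rank / 10;       /* e.g. ranks 20 and 21 both map to domain 2 */
}
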
*/ { int svc_rf = 2; - d_rank_t ranks[] = {0, 1, 2, 3, 4, 5, 6, 7}; + d_rank_t ranks[] = { + 0, 1, + 10, 11, + 20, 21, + 30, 31, + 40, 41, + 50, 51 + }; d_rank_t down_ranks[] = {}; - d_rank_t replicas_ranks[] = {0, 1, 2, 3, 4}; + d_rank_t replicas_ranks[] = {0, 10, 20, 30, 40}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(0) @@ -602,6 +1063,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0, 1, 2}; d_rank_t down_ranks[] = {0}; d_rank_t replicas_ranks[] = {0, 1, 2}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(-DER_INVAL) } @@ -612,6 +1074,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0}; d_rank_t down_ranks[] = {}; d_rank_t replicas_ranks[] = {0}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(0) @@ -628,6 +1091,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t down_ranks[] = {}; d_rank_t replicas_ranks[] = {0}; d_rank_t expected_to_add[] = {1}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(0) @@ -645,6 +1109,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t down_ranks[] = {2}; d_rank_t replicas_ranks[] = {0}; d_rank_t expected_to_add[] = {1}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(0) @@ -662,6 +1127,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t down_ranks[] = {}; d_rank_t replicas_ranks[] = {0}; d_rank_t expected_to_add[] = {1, 2}; + d_rank_t self = 0; call_testu_plan_svc_reconfs(0) @@ -672,12 +1138,39 @@ ds_pool_test_plan_svc_reconfs(void) call_d_rank_list_free } + /* A PS successfully achieves the RF, picking the best distribution. */ + { + int svc_rf = 1; + d_rank_t ranks[] = { + 0, 1, + 10, 11, + 20, 21 + }; + d_rank_t down_ranks[] = {}; + d_rank_t replicas_ranks[] = {0}; + d_rank_t self = 0; + int expected_dist[] = {1, 1, 1}; + d_rank_list_t *new_replicas_ranks; + + call_testu_plan_svc_reconfs(0) + + new_replicas_ranks = testu_rank_sets_add(replicas_ranks, ARRAY_SIZE(replicas_ranks), + to_add); + D_ASSERT(testu_rank_set_has_dist(new_replicas_ranks, expected_dist, + ARRAY_SIZE(expected_dist))); + d_rank_list_free(new_replicas_ranks); + D_ASSERT(to_remove->rl_nr == 0); + + call_d_rank_list_free + } + /* A PS removes the down rank even when there's no replacement. */ { int svc_rf = 1; d_rank_t ranks[] = {0, 1, 2}; d_rank_t down_ranks[] = {2}; d_rank_t replicas_ranks[] = {0, 1, 2}; + d_rank_t self = 0; d_rank_t expected_to_remove[] = {2}; call_testu_plan_svc_reconfs(0) @@ -692,17 +1185,21 @@ ds_pool_test_plan_svc_reconfs(void) /* A PS replaces one down rank. 
*/ { int svc_rf = 1; - d_rank_t ranks[] = {0, 1, 2, 3, 4}; - d_rank_t down_ranks[] = {2}; - d_rank_t replicas_ranks[] = {0, 1, 2}; - d_rank_t expected_to_add_candidates[] = {3, 4}; - d_rank_t expected_to_remove[] = {2}; + d_rank_t ranks[] = { + 0, 1, + 10, 11, + 20, 21 + }; + d_rank_t down_ranks[] = {21}; + d_rank_t replicas_ranks[] = {0, 10, 21}; + d_rank_t self = 0; + d_rank_t expected_to_add[] = {20}; + d_rank_t expected_to_remove[] = {21}; call_testu_plan_svc_reconfs(0) - D_ASSERT(to_add->rl_nr == 1); - D_ASSERT(testu_rank_sets_belong(to_add, expected_to_add_candidates, - ARRAY_SIZE(expected_to_add_candidates))); + D_ASSERT(testu_rank_sets_identical(to_add, expected_to_add, + ARRAY_SIZE(expected_to_add))); D_ASSERT(testu_rank_sets_identical(to_remove, expected_to_remove, ARRAY_SIZE(expected_to_remove))); @@ -718,6 +1215,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0, 1, 2, 3}; d_rank_t down_ranks[] = {}; d_rank_t replicas_ranks[] = {0}; + d_rank_t self = 0; d_rank_t expected_to_add[] = {1, 2, 3}; call_testu_plan_svc_reconfs(0) @@ -735,6 +1233,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0, 1, 2}; d_rank_t down_ranks[] = {}; d_rank_t replicas_ranks[] = {0, 1, 2}; + d_rank_t self = 0; d_rank_t expected_to_remove[] = {1, 2}; call_testu_plan_svc_reconfs(0) @@ -752,6 +1251,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0, 1, 2, 3, 4, 5}; d_rank_t down_ranks[] = {2}; d_rank_t replicas_ranks[] = {0, 1, 2}; + d_rank_t self = 0; d_rank_t expected_to_add[] = {3, 4, 5}; d_rank_t expected_to_remove[] = {2}; @@ -771,6 +1271,7 @@ ds_pool_test_plan_svc_reconfs(void) d_rank_t ranks[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; d_rank_t down_ranks[] = {1, 2, 3}; d_rank_t replicas_ranks[] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + d_rank_t self = 0; d_rank_t expected_to_remove_candidates[] = {1, 2, 3, 4, 5, 6, 7, 8}; d_rank_list_t tmp; @@ -787,12 +1288,41 @@ ds_pool_test_plan_svc_reconfs(void) call_d_rank_list_free } + /* A PS removes from crowded domains first. */ + { + int svc_rf = 1; + d_rank_t ranks[] = { + 0, 1, + 10, 11, + 20, 21 + }; + d_rank_t down_ranks[] = {}; + d_rank_t replicas_ranks[] = {0, 1, 10, 20, 21}; + d_rank_t self = 0; + int expected_dist[] = {1, 1, 1}; + d_rank_list_t *new_replicas_ranks; + + call_testu_plan_svc_reconfs(0) + + D_ASSERT(to_add->rl_nr == 0); + D_ASSERT(to_remove->rl_nr == 2); + new_replicas_ranks = testu_rank_sets_subtract(replicas_ranks, + ARRAY_SIZE(replicas_ranks), + to_remove); + D_ASSERT(testu_rank_set_has_dist(new_replicas_ranks, expected_dist, + ARRAY_SIZE(expected_dist))); + d_rank_list_free(new_replicas_ranks); + + call_d_rank_list_free + } + /* A shrink that is too complicated to comment on. */ { int svc_rf = 3; d_rank_t ranks[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; d_rank_t down_ranks[] = {1, 3, 5, 7}; d_rank_t replicas_ranks[] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + d_rank_t self = 0; d_rank_t expected_to_add[] = {9}; d_rank_t expected_to_remove[] = {1, 3, 5, 7}; @@ -806,6 +1336,32 @@ ds_pool_test_plan_svc_reconfs(void) call_d_rank_list_free } + /* A PS moves a replica out of a crowded domain. 
*/ + { + int svc_rf = 1; + d_rank_t ranks[] = { + 0, 1, + 10, 11, + 20, 21 + }; + d_rank_t down_ranks[] = {}; + d_rank_t replicas_ranks[] = {0, 1, 10}; + d_rank_t self = 0; + d_rank_t expected_to_add_candidates[] = {20, 21}; + d_rank_t expected_to_remove_candidates[] = {0, 1}; + + call_testu_plan_svc_reconfs(0) + + D_ASSERT(to_add->rl_nr == 1); + D_ASSERT(testu_rank_sets_belong(to_add, expected_to_add_candidates, + ARRAY_SIZE(expected_to_add_candidates))); + D_ASSERT(to_remove->rl_nr == 1); + D_ASSERT(testu_rank_sets_belong(to_remove, expected_to_remove_candidates, + ARRAY_SIZE(expected_to_remove_candidates))); + + call_d_rank_list_free + } + #undef call_d_rank_list_free #undef call_testu_plan_svc_reconfs } diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index 04314bc8d07..8be8dc6ed99 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2419,7 +2419,7 @@ rdb_raft_get_election_timeout(void) unsigned int default_value = 7000; unsigned int value = default_value; - d_getenv_int(name, &value); + d_getenv_uint(name, &value); if (value == 0 || value > INT_MAX) { D_WARN("%s not in (0, %d] (defaulting to %u)\n", name, INT_MAX, default_value); value = default_value; @@ -2434,7 +2434,7 @@ rdb_raft_get_request_timeout(void) unsigned int default_value = 3000; unsigned int value = default_value; - d_getenv_int(name, &value); + d_getenv_uint(name, &value); if (value == 0 || value > INT_MAX) { D_WARN("%s not in (0, %d] (defaulting to %u)\n", name, INT_MAX, default_value); value = default_value; @@ -2449,7 +2449,7 @@ rdb_raft_get_lease_maintenance_grace(void) unsigned int default_value = 7000; unsigned int value = default_value; - d_getenv_int(name, &value); + d_getenv_uint(name, &value); if (value == 0 || value > INT_MAX) { D_WARN("%s not in (0, %d] (defaulting to %u)\n", name, INT_MAX, default_value); value = default_value; @@ -2464,7 +2464,7 @@ rdb_raft_get_compact_thres(void) unsigned int default_value = 256; unsigned int value = default_value; - d_getenv_int(name, &value); + d_getenv_uint(name, &value); if (value == 0) { D_WARN("%s not in (0, %u] (defaulting to %u)\n", name, UINT_MAX, default_value); value = default_value; @@ -2479,7 +2479,7 @@ rdb_raft_get_ae_max_entries(void) unsigned int default_value = 32; unsigned int value = default_value; - d_getenv_int(name, &value); + d_getenv_uint(name, &value); if (value == 0) { D_WARN("%s not in (0, %u] (defaulting to %u)\n", name, UINT_MAX, default_value); value = default_value; @@ -2947,6 +2947,8 @@ rdb_raft_get_ranks(struct rdb *db, d_rank_list_t **ranksp) } ranks->rl_nr = i; + d_rank_list_sort(ranks); + *ranksp = ranks; rc = 0; mutex: diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index f76e578d9f2..4c2c78c4bee 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -402,6 +402,7 @@ struct rebuild_scan_arg { int snapshot_cnt; uint32_t yield_freq; int32_t obj_yield_cnt; + struct ds_cont_child *cont_child; }; /** @@ -696,7 +697,7 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, int i; int rc = 0; - if (rpt->rt_abort) { + if (rpt->rt_abort || arg->cont_child->sc_stopping) { D_DEBUG(DB_REBUILD, "rebuild is aborted\n"); return 1; } @@ -839,35 +840,45 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, } rc = vos_cont_open(iter_param->ip_hdl, entry->ie_couuid, &coh); + if (rc == -DER_NONEXIST) { + D_DEBUG(DB_REBUILD, DF_UUID" already 
destroyed\n", DP_UUID(arg->co_uuid)); + return 0; + } + if (rc != 0) { D_ERROR("Open container "DF_UUID" failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); return rc; } + rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); + if (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN) { + D_DEBUG(DB_REBUILD, DF_UUID" already destroyed or destroying\n", + DP_UUID(arg->co_uuid)); + rc = 0; + D_GOTO(close, rc); + } + + if (rc != 0) { + D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", + DP_UUID(entry->ie_couuid), DP_RC(rc)); + D_GOTO(close, rc); + } + cont_child->sc_rebuilding = 1; + rc = ds_cont_fetch_snaps(rpt->rt_pool->sp_iv_ns, entry->ie_couuid, NULL, &snapshot_cnt); if (rc) { D_ERROR("Container "DF_UUID", ds_cont_fetch_snaps failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; + D_GOTO(close, rc); } rc = ds_cont_get_props(&arg->co_props, rpt->rt_pool->sp_uuid, entry->ie_couuid); if (rc) { D_ERROR("Container "DF_UUID", ds_cont_get_props failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; - } - - rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); - if (rc != 0) { - D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", - DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; + D_GOTO(close, rc); } /* Wait for EC aggregation to finish. NB: migrate needs to wait for EC aggregation to finish */ @@ -899,6 +910,7 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_flags = VOS_IT_FOR_MIGRATION; uuid_copy(arg->co_uuid, entry->ie_couuid); arg->snapshot_cnt = snapshot_cnt; + arg->cont_child = cont_child; /* If there is no snapshots, then rebuild does not need to migrate * punched objects at all. 
Ideally, it should ignore any objects @@ -914,8 +926,11 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, close: vos_cont_close(coh); - if (cont_child != NULL) + if (cont_child != NULL) { + cont_child->sc_rebuilding = 0; + ABT_cond_broadcast(cont_child->sc_rebuild_cond); ds_cont_child_put(cont_child); + } D_DEBUG(DB_REBUILD, DF_UUID"/"DF_UUID" iterate cont done: "DF_RC"\n", DP_UUID(rpt->rt_pool_uuid), DP_UUID(entry->ie_couuid), diff --git a/src/tests/ftest/cart/dual_iface_server.c b/src/tests/ftest/cart/dual_iface_server.c index 38e9f79f8ea..21993da7c4b 100644 --- a/src/tests/ftest/cart/dual_iface_server.c +++ b/src/tests/ftest/cart/dual_iface_server.c @@ -242,9 +242,9 @@ server_main(d_rank_t my_rank, const char *str_port, const char *str_interface, struct stat st; crt_init_options_t init_opts = {0}; - setenv("FI_UNIVERSE_SIZE", "1024", 1); - setenv("D_LOG_MASK", "ERR", 1); - setenv("D_PORT_AUTO_ADJUST", "1", 1); + d_setenv("FI_UNIVERSE_SIZE", "1024", 1); + d_setenv("D_LOG_MASK", "ERR", 1); + d_setenv("D_PORT_AUTO_ADJUST", "1", 1); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); diff --git a/src/tests/ftest/cart/utest/utest_portnumber.c b/src/tests/ftest/cart/utest/utest_portnumber.c index 12164f5201d..435182e85af 100644 --- a/src/tests/ftest/cart/utest/utest_portnumber.c +++ b/src/tests/ftest/cart/utest/utest_portnumber.c @@ -41,7 +41,8 @@ #include #include -#include "gurt/debug.h" +#include +#include #define CHILD1_INIT_ERR 10 #define CHILD1_CONTEXT_DESTROY_ERR 11 @@ -191,8 +192,8 @@ run_test_fork(void **state) static void test_port_tcp(void **state) { - setenv("OFI_INTERFACE", "lo", 1); - setenv("CRT_PHY_ADDR_STR", "ofi+tcp;ofi_rxm", 1); + d_setenv("OFI_INTERFACE", "lo", 1); + d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp;ofi_rxm", 1); run_test_fork(state); } @@ -200,17 +201,17 @@ test_port_tcp(void **state) static void test_port_sockets(void **state) { - setenv("OFI_INTERFACE", "eth0", 1); - setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); + d_setenv("OFI_INTERFACE", "eth0", 1); + d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); run_test_fork(state); }; static void test_port_verb(void **state) { - setenv("OFI_INTERFACE", "eth0", 1); - setenv("OFI_DOMAIN", "Must define here", 1); - setenv("CRT_PHY_ADDR_STR", "ofi+verbs;ofi_rxm", 1); + d_setenv("OFI_INTERFACE", "eth0", 1); + d_setenv("OFI_DOMAIN", "Must define here", 1); + d_setenv("CRT_PHY_ADDR_STR", "ofi+verbs;ofi_rxm", 1); run_test_fork(state); }; #endif @@ -295,10 +296,10 @@ int main(int argc, char **argv) #endif }; - setenv("FI_UNIVERSE_SIZE", "2048", 1); - setenv("FI_OFI_RXM_USE_SRX", "1", 1); - setenv("D_LOG_MASK", "CRIT", 1); - setenv("OFI_PORT", "34571", 1); + d_setenv("FI_UNIVERSE_SIZE", "2048", 1); + d_setenv("FI_OFI_RXM_USE_SRX", "1", 1); + d_setenv("D_LOG_MASK", "CRIT", 1); + d_setenv("OFI_PORT", "34571", 1); d_register_alt_assert(mock_assert); diff --git a/src/tests/ftest/cart/utest/utest_protocol.c b/src/tests/ftest/cart/utest/utest_protocol.c index f6182460089..2e69923005f 100644 --- a/src/tests/ftest/cart/utest/utest_protocol.c +++ b/src/tests/ftest/cart/utest/utest_protocol.c @@ -9,7 +9,7 @@ #include #include #include -#include "gurt/debug.h" +#include #include #include diff --git a/src/tests/ftest/cart/utest/utest_swim.c b/src/tests/ftest/cart/utest/utest_swim.c index 01ccabb2de9..340434224d2 100644 --- a/src/tests/ftest/cart/utest/utest_swim.c +++ b/src/tests/ftest/cart/utest/utest_swim.c @@ -60,8 +60,8 @@ init_tests(void **state) fprintf(stdout, "Seeding this test 
run with seed=%u\n", seed); srand(seed); - setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); - setenv("OFI_INTERFACE", "lo", 1); + d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); + d_setenv("OFI_INTERFACE", "lo", 1); return 0; } diff --git a/src/tests/ftest/checksum/csum_error_logging.py b/src/tests/ftest/checksum/csum_error_logging.py index 9569f2e31ea..38950af94a8 100644 --- a/src/tests/ftest/checksum/csum_error_logging.py +++ b/src/tests/ftest/checksum/csum_error_logging.py @@ -37,7 +37,7 @@ def get_checksum_error_value(self, dmg, device_id): for device in devices: try: if device['uuid'] == device_id: - return device['health']['checksum_errs'] + return device['ctrlr']['health_stats']['checksum_errs'] except KeyError as error: self.fail( 'Error parsing dmg storage query device-health output: {}'.format(error)) diff --git a/src/tests/ftest/control/config_generate_run.py b/src/tests/ftest/control/config_generate_run.py index 571e23c103e..b17ad7beb4e 100644 --- a/src/tests/ftest/control/config_generate_run.py +++ b/src/tests/ftest/control/config_generate_run.py @@ -62,7 +62,7 @@ def test_config_generate_run(self): self.fail(f"Error loading dmg generated config! {error}") # Stop and restart daos_server. self.start_server_managers() has the - # server startup check built into it, so if there's something wrong, + # server start-up check built into it, so if there's something wrong, # it'll throw an error. self.log.info("Stopping servers") self.stop_servers() diff --git a/src/tests/ftest/control/config_generate_run.yaml b/src/tests/ftest/control/config_generate_run.yaml index e834d889b73..6a2e9ea2519 100644 --- a/src/tests/ftest/control/config_generate_run.yaml +++ b/src/tests/ftest/control/config_generate_run.yaml @@ -2,6 +2,8 @@ hosts: test_servers: 1 timeout: 250 server_config: + transport_config: + allow_insecure: False engines_per_host: 1 engines: 0: @@ -9,8 +11,17 @@ server_config: 0: class: ram scm_mount: /mnt/daos0 +# Force the use of certificates regardless of the launch.py --insecure setting. pool: control_method: dmg + transport_config: + allow_insecure: False +agent_config: + transport_config: + allow_insecure: False +dmg: + transport_config: + allow_insecure: False setup: start_servers_once: False config_generate_params: !mux diff --git a/src/tests/ftest/control/dmg_system_reformat.py b/src/tests/ftest/control/dmg_system_reformat.py index d20354018c8..887c23bc0f1 100644 --- a/src/tests/ftest/control/dmg_system_reformat.py +++ b/src/tests/ftest/control/dmg_system_reformat.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -8,6 +8,7 @@ from apricot import TestWithServers from avocado.core.exceptions import TestFail from exception_utils import CommandFailure +from general_utils import journalctl_time from test_utils_pool import add_pool, get_size_params @@ -91,6 +92,7 @@ def test_dmg_system_reformat(self): # Check that engine starts up again self.log.info(" Waiting for the engines to start") + self.server_managers[-1].manager.timestamps["start"] = journalctl_time() self.server_managers[-1].detect_engine_start() # Check that we have cleared storage by checking pool list diff --git a/src/tests/ftest/daos_test/rebuild.yaml b/src/tests/ftest/daos_test/rebuild.yaml index 6c1c3f21c4b..e5a59dd0f40 100644 --- a/src/tests/ftest/daos_test/rebuild.yaml +++ b/src/tests/ftest/daos_test/rebuild.yaml @@ -97,8 +97,8 @@ daos_tests: test_rebuild_23: -s3 -u subtests="23" test_rebuild_24: -s3 -u subtests="24" test_rebuild_25: -s5 -u subtests="25" - test_rebuild_26: -s3 -u subtests="26" - test_rebuild_27: -s6 -u subtests="27" + test_rebuild_26: -s5 -u subtests="26" + test_rebuild_27: -s7 -u subtests="27" test_rebuild_28: -s3 -u subtests="28" test_rebuild_29: -s5 -u subtests="29" test_rebuild_30: -s5 -u subtests="30" diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index b853a8c1d89..afcc048f965 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -27,7 +27,7 @@ timeouts: test_daos_extend_simple: 3600 test_daos_oid_allocator: 640 test_daos_checksum: 500 - test_daos_rebuild_ec: 6400 + test_daos_rebuild_ec: 7200 test_daos_aggregate_ec: 200 test_daos_degraded_ec: 1900 test_daos_dedup: 220 @@ -172,6 +172,10 @@ daos_tests: test_daos_ec_io: -l"EC_4P2G1" test_daos_rebuild_ec: -s5 test_daos_md_replication: -s5 + test_daos_degraded_mode: -s7 + test_daos_drain_simple: -s3 + test_daos_extend_simple: -s3 + test_daos_oid_allocator: -s5 scalable_endpoint: test_daos_degraded_mode: true stopped_ranks: diff --git a/src/tests/ftest/deployment/ior_per_rank.py b/src/tests/ftest/deployment/ior_per_rank.py index f914216f326..5a8463cb940 100644 --- a/src/tests/ftest/deployment/ior_per_rank.py +++ b/src/tests/ftest/deployment/ior_per_rank.py @@ -5,6 +5,7 @@ """ from avocado.core.exceptions import TestFail +from ClusterShell.NodeSet import NodeSet from general_utils import DaosTestError, percent_change from ior_test_base import IorTestBase from ior_utils import IorCommand, IorMetrics @@ -32,53 +33,52 @@ def execute_ior_per_rank(self, rank): self.log.info("Running Test on rank: %s", rank) # create the pool on specified rank. 
self.add_pool(connect=False, target_list=[rank]) + self.container = self.get_container(self.pool) + + host = self.server_managers[0].get_host(rank) + + # execute ior on given rank and collect the results + try: + self.ior_cmd.flags.update(self.write_flags) + dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) + dfs_perf_write = IorCommand.get_ior_metrics(dfs_out) + self.ior_cmd.flags.update(self.read_flags) + dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) + dfs_perf_read = IorCommand.get_ior_metrics(dfs_out) + + # Destroy container, to be sure we use newly created container in next iteration + self.container.destroy() + self.container = None + + # gather actual and expected perf data to be compared + dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB]) + dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB]) + actual_write_x = abs(percent_change(self.expected_bw, dfs_max_write)) + actual_read_x = abs(percent_change(self.expected_bw, dfs_max_read)) + + # verify write performance + if actual_write_x > self.write_x: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append( + f"rank {rank} low write perf. " + f"BW: {dfs_max_write:.2f}/{self.expected_bw:.2f}; " + f"percent diff: {actual_write_x:.2f}/{self.write_x:.2f}") + + # verify read performance + if actual_read_x > self.read_x: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append( + f"rank {rank} low read perf. " + f"BW: {dfs_max_read:.2f}/{self.expected_bw:.2f}; " + f"percent diff: {actual_read_x:.2f}/{self.read_x:.2f}") + + except (TestFail, DaosTestError) as error: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append(str(error)) - # execute ior on given rank for different transfer sizes and collect the results - for idx, transfer_size in enumerate(self.transfer_sizes): - try: - self.ior_cmd.transfer_size.update(transfer_size) - self.ior_cmd.flags.update(self.write_flags) - dfs_out = self.run_ior_with_pool(fail_on_warning=self.log.info) - dfs_perf_write = IorCommand.get_ior_metrics(dfs_out) - self.ior_cmd.flags.update(self.read_flags) - dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) - dfs_perf_read = IorCommand.get_ior_metrics(dfs_out) - - # Destroy container, to be sure we use newly created container in next iteration - self.container.destroy() - self.container = None - - # gather actual and expected perf data to be compared - if idx == 0: - dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB]) - dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB]) - actual_write_x = percent_change(dfs_max_write, self.expected_bw) - actual_read_x = percent_change(dfs_max_read, self.expected_bw) - else: - dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_OPS]) - dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_OPS]) - actual_write_x = percent_change(dfs_max_write, self.expected_iops) - actual_read_x = percent_change(dfs_max_read, self.expected_iops) - - # compare actual and expected perf data - self.assertLessEqual(abs(actual_write_x), self.write_x, - "Max Write Diff too large for rank: {}".format(rank)) - self.assertLessEqual(abs(actual_read_x), self.read_x, - "Max Read Diff too large for rank: {}".format(rank)) - # collect list of good nodes - good_node = self.server_managers[0].get_host(rank) - if ((good_node not in self.good_nodes) - and (good_node not in self.failed_nodes)): - 
self.good_nodes.append(good_node) - except (TestFail, DaosTestError): - # collect bad nodes - failed_node = self.server_managers[0].get_host(rank) - if failed_node not in self.failed_nodes: - self.failed_nodes[failed_node] = [rank] - else: - self.failed_nodes[failed_node].append(rank) - if failed_node in self.good_nodes: - self.good_nodes.remove(failed_node) # Destroy pool, to be sure we use newly created pool in next iteration self.pool.destroy() self.pool = None @@ -100,8 +100,6 @@ def test_ior_per_rank(self): # test params self.failed_nodes = {} - self.good_nodes = [] - self.transfer_sizes = self.params.get("transfer_sizes", self.ior_cmd.namespace) self.write_flags = self.params.get("write_flags", self.ior_cmd.namespace) self.read_flags = self.params.get("read_flags", self.ior_cmd.namespace) @@ -122,13 +120,15 @@ def test_ior_per_rank(self): for rank in rank_list: self.execute_ior_per_rank(rank) - # list of good nodes - if self.good_nodes: - self.log.info("List of good nodes: %s", self.good_nodes) + # the good nodes are any that did not fail + good_nodes = self.hostlist_servers - NodeSet.fromlist(self.failed_nodes.keys()) + if good_nodes: + self.log.info("Good nodes: %s", good_nodes) # list the failed node and the rank number associated with that node if self.failed_nodes: - self.log.info("List of failed ranks with corresponding nodes") - for node, rank in self.failed_nodes.items(): - self.log.info("Node: %s, Rank: %s", node, rank) + self.log.info("List of failed nodes with corresponding ranks") + for node, reason_list in self.failed_nodes.items(): + for reason in reason_list: + self.log.info("%s: %s", node, reason) self.fail("Performance check failed for one or more nodes") diff --git a/src/tests/ftest/deployment/ior_per_rank.yaml b/src/tests/ftest/deployment/ior_per_rank.yaml index 406ef6dfff9..3a45226b5ca 100644 --- a/src/tests/ftest/deployment/ior_per_rank.yaml +++ b/src/tests/ftest/deployment/ior_per_rank.yaml @@ -23,12 +23,12 @@ server_config: pool: mode: 146 size: 750G # Cannot use percentage, as it does not work when using pool create for per rank. - control_method: dmg properties: ec_cell_sz:128KiB container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos + oclass: SX ior: client_processes: ppn: 32 @@ -36,13 +36,11 @@ ior: test_file: testFile write_flags: "-w -C -e -g -G 27 -k -Q 1" read_flags: "-r -R -C -e -g -G 27 -k -Q 1" - sw_deadline: 30 + sw_deadline: 15 sw_wearout: 1 sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile" - dfs_oclass: 'SX' - transfer_sizes: - - 1M - - 256B + dfs_oclass: SX + transfer_size: 1M block_size: 150G # 0.5 only for CI, due to the varying nature of different clusters in CI. # Change it to 15% (0.15) for Aurora. 
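The ior_per_rank.py refactor above replaces per-rank assertions with a per-host dictionary of failure reasons, so a single run reports every underperforming rank instead of aborting at the first, and the good nodes fall out as a NodeSet difference at the end. A minimal standalone sketch of the pass/fail rule, assuming percent_change(a, b) returns (b - a) / a (the helper comes from general_utils, whose definition is not part of this patch):

# Hypothetical sketch of the per-rank performance check; record_perf and its
# arguments are illustrative names, not part of the DAOS test framework.
def percent_change(baseline, value):
    """Fractional change of value relative to baseline (assumed semantics)."""
    return (value - baseline) / baseline

def record_perf(failed_nodes, host, rank, kind, measured, expected, allowed):
    """Append a failure reason for host when the deviation exceeds allowed."""
    actual = abs(percent_change(expected, measured))
    if actual > allowed:
        failed_nodes.setdefault(host, []).append(
            f"rank {rank} low {kind} perf. "
            f"BW: {measured:.2f}/{expected:.2f}; "
            f"percent diff: {actual:.2f}/{allowed:.2f}")

failed_nodes = {}
record_perf(failed_nodes, "node-1", 0, "write", 950.0, 1000.0, 0.5)  # passes
record_perf(failed_nodes, "node-2", 1, "read", 300.0, 1000.0, 0.5)   # fails
print(failed_nodes)

Keying failures by host rather than keeping a separate good_nodes list removes the bookkeeping the old code needed to shuffle hosts between the two collections.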
diff --git a/src/tests/ftest/nvme/fragmentation.py b/src/tests/ftest/nvme/fragmentation.py index 65dd3406ef6..fe09ef50f72 100644 --- a/src/tests/ftest/nvme/fragmentation.py +++ b/src/tests/ftest/nvme/fragmentation.py @@ -35,7 +35,7 @@ def setUp(self): self.ior_transfer_size = self.params.get("transfer_block_size", '/run/ior/iorflags/*') self.ior_dfs_oclass = self.params.get("obj_class", '/run/ior/iorflags/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.pool = None self.out_queue = queue.Queue() diff --git a/src/tests/ftest/nvme/health.py b/src/tests/ftest/nvme/health.py index f1de78a7546..5a6ac8b0f18 100644 --- a/src/tests/ftest/nvme/health.py +++ b/src/tests/ftest/nvme/health.py @@ -127,12 +127,12 @@ def test_monitor_for_large_pools(self): if device['uuid'] != uuid: error_msg = ' <== ERROR: UNEXPECTED DEVICE UUID' errors += 1 - elif device['dev_state'].lower() != 'normal': + elif device['ctrlr']['dev_state'].lower() != 'normal': error_msg = ' <== ERROR: STATE NOT NORMAL' errors += 1 self.log.info( ' health is %s for %s%s', - device['dev_state'], device['uuid'], error_msg) + device['ctrlr']['dev_state'], device['uuid'], error_msg) except KeyError as error: self.fail( "Error parsing dmg.storage_query_device_health() output: {}".format( diff --git a/src/tests/ftest/nvme/pool_capacity.py b/src/tests/ftest/nvme/pool_capacity.py index 2f88d012092..4c417aa83a9 100644 --- a/src/tests/ftest/nvme/pool_capacity.py +++ b/src/tests/ftest/nvme/pool_capacity.py @@ -32,7 +32,7 @@ def setUp(self): self.ior_test_sequence = self.params.get("ior_test_sequence", '/run/ior/iorflags/*') self.ior_dfs_oclass = self.params.get("obj_class", '/run/ior/iorflags/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.out_queue = queue.Queue() def ior_thread(self, pool, oclass, api, test, flags, results): diff --git a/src/tests/ftest/nvme/pool_exclude.py b/src/tests/ftest/nvme/pool_exclude.py index 578070c1c47..72a6c9b50ad 100644 --- a/src/tests/ftest/nvme/pool_exclude.py +++ b/src/tests/ftest/nvme/pool_exclude.py @@ -32,8 +32,7 @@ def setUp(self): self.dmg_command = self.get_dmg_command() self.ior_test_sequence = self.params.get("ior_test_sequence", "/run/ior/iorflags/*") # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file( - self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.pool = None self.cont_list = [] self.dmg_command.exit_status_exception = True diff --git a/src/tests/ftest/nvme/pool_extend.py b/src/tests/ftest/nvme/pool_extend.py index 18e48ced8a1..876050a8561 100644 --- a/src/tests/ftest/nvme/pool_extend.py +++ b/src/tests/ftest/nvme/pool_extend.py @@ -30,7 +30,7 @@ def setUp(self): super().setUp() # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.dmg_command.exit_status_exception = True def run_nvme_pool_extend(self, num_pool, oclass=None): diff --git a/src/tests/ftest/osa/offline_drain.py b/src/tests/ftest/osa/offline_drain.py index 26be76d95a8..ea31e86cf93 100644 --- 
a/src/tests/ftest/osa/offline_drain.py +++ b/src/tests/ftest/osa/offline_drain.py @@ -29,8 +29,7 @@ def setUp(self): self.ior_test_sequence = self.params.get( "ior_test_sequence", '/run/ior/iorflags/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file( - self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) def run_offline_drain_test(self, num_pool, data=False, oclass=None, pool_fillup=0): """Run the offline drain without data. diff --git a/src/tests/ftest/osa/offline_reintegration.py b/src/tests/ftest/osa/offline_reintegration.py index 6491a477d7a..d88a2ee2f9c 100644 --- a/src/tests/ftest/osa/offline_reintegration.py +++ b/src/tests/ftest/osa/offline_reintegration.py @@ -30,8 +30,7 @@ def setUp(self): self.ior_test_repetitions = self.params.get("pool_test_repetitions", '/run/pool_capacity/*') self.loop_test_cnt = 1 # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file( - self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.dmg_command.exit_status_exception = True def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None, diff --git a/src/tests/ftest/osa/online_drain.py b/src/tests/ftest/osa/online_drain.py index bf5804c3767..6bef36ead89 100644 --- a/src/tests/ftest/osa/online_drain.py +++ b/src/tests/ftest/osa/online_drain.py @@ -28,8 +28,7 @@ def setUp(self): "ior_test_sequence", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file( - self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.dmg_command.exit_status_exception = True self.pool = None diff --git a/src/tests/ftest/osa/online_extend.py b/src/tests/ftest/osa/online_extend.py index b1ae7c247f3..d8c8b18e607 100644 --- a/src/tests/ftest/osa/online_extend.py +++ b/src/tests/ftest/osa/online_extend.py @@ -33,7 +33,7 @@ def setUp(self): self.extra_servers = self.get_hosts_from_yaml( "test_servers", "server_partition", "server_reservation", "/run/extra_servers/*") # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.pool = None self.dmg_command.exit_status_exception = True self.daos_racer = None diff --git a/src/tests/ftest/osa/online_parallel_test.py b/src/tests/ftest/osa/online_parallel_test.py index c99a45dbd66..23addc6c392 100644 --- a/src/tests/ftest/osa/online_parallel_test.py +++ b/src/tests/ftest/osa/online_parallel_test.py @@ -35,7 +35,7 @@ def setUp(self): self.ior_test_sequence = self.params.get("ior_test_sequence", '/run/ior/iorflags/*') self.ior_dfs_oclass = self.params.get("obj_class", '/run/ior/iorflags/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.pool = None self.out_queue = queue.Queue() self.ds_racer_queue = queue.Queue() diff --git a/src/tests/ftest/osa/online_reintegration.py b/src/tests/ftest/osa/online_reintegration.py index 3e4cda5eb3c..3fcc74e7629 100644 --- a/src/tests/ftest/osa/online_reintegration.py +++ 
b/src/tests/ftest/osa/online_reintegration.py @@ -31,7 +31,7 @@ def setUp(self): self.ior_test_sequence = self.params.get("ior_test_sequence", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') # Recreate the client hostfile without slots defined - self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir, None) + self.hostfile_clients = write_host_file(self.hostlist_clients, self.workdir) self.pool = None self.ds_racer_queue = queue.Queue() self.daos_racer = None diff --git a/src/tests/ftest/pool/create_all_hw.yaml b/src/tests/ftest/pool/create_all_hw.yaml index 3951e3394cc..d84c5762652 100644 --- a/src/tests/ftest/pool/create_all_hw.yaml +++ b/src/tests/ftest/pool/create_all_hw.yaml @@ -9,7 +9,7 @@ hosts: timeouts: test_one_pool_hw: 240 test_two_pools_hw: 320 - test_recycle_pools_hw: 1050 + test_recycle_pools_hw: 1200 test_one_pool_hw: deltas: diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index 3036c92b397..89930160e3c 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -561,7 +561,7 @@ def main(): logger.error(str(error)) sys.exit(1) - # Slurm Startup + # Slurm Start-up try: slurm_setup.start_slurm(args.user, args.debug) except SlurmSetupException as error: diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 78a75dae36a..5bda1cba350 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -694,7 +694,7 @@ def setUp(self): self.agent_manager_class = self.params.get( "agent_manager_class", "/run/setup/*", self.agent_manager_class) - # Support configuring the startup of servers and agents by the setup() + # Support configuring the start-up of servers and agents by the setup() # method from the test yaml file self.setup_start_servers = self.params.get( "start_servers", "/run/setup/*", self.setup_start_servers) diff --git a/src/tests/ftest/util/apricot/setup.py b/src/tests/ftest/util/apricot/setup.py index 3734157b664..948cd5f7005 100644 --- a/src/tests/ftest/util/apricot/setup.py +++ b/src/tests/ftest/util/apricot/setup.py @@ -3,6 +3,7 @@ setup(name='apricot', description='Apricot - Avocado SubFramwork', + # pylint: disable-next=consider-using-with version=open("VERSION", "r").read().strip(), author='Apricot Developers', author_email='apricot-devel@example.com', diff --git a/src/tests/ftest/util/command_utils_base.py b/src/tests/ftest/util/command_utils_base.py index 07672cbdf5f..23f18f8d98f 100644 --- a/src/tests/ftest/util/command_utils_base.py +++ b/src/tests/ftest/util/command_utils_base.py @@ -715,7 +715,7 @@ def __init__(self, namespace, title, log_dir): """ super().__init__(namespace, None, title) self._log_dir = log_dir - default_insecure = str(os.environ.get("DAOS_INSECURE_MODE", True)) + default_insecure = str(os.environ.get("DAOS_TEST_INSECURE_MODE", True)) default_insecure = default_insecure.lower() == "true" self.ca_cert = LogParameter(self._log_dir, None, "daosCA.crt") self.allow_insecure = BasicParameter(None, default_insecure) diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 7d336c4304d..53913ff934a 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -406,6 +406,8 @@ def storage_query_usage(self): # "serial": "CVFT534200AY400BGN", # "pci_addr": "0000:05:00.0", # "fw_rev": "8DV10131", + # "vendor_id": "0x8086", + # "pci_type": "", # "socket_id": 0, # "health_stats": 
null, # "namespaces": [ @@ -416,7 +418,7 @@ def storage_query_usage(self): # ], # "smd_devices": [ # { - # "dev_state": "NORMAL", + # "role_bits": 0, # "uuid": "259608d1-c469-4684-9986-9f7708b20ca3", # "tgt_ids": [ 0, 1, 2, 3, 4, 5, 6, 7 ], # "rank": 0, @@ -428,12 +430,28 @@ def storage_query_usage(self): # "meta_wal_size": 0, # "rdb_size": 134217728, # "rdb_wal_size": 268435456, - # "health": null, - # "tr_addr": "0000:05:00.0", - # "roles": "data", - # "has_sys_xs": false + # "roles": "NA", + # "has_sys_xs": false, + # "ctrlr": { + # "info": "", + # "model": "", + # "serial": "", + # "pci_addr": "", + # "fw_rev": "", + # "vendor_id": "", + # "pci_type": "", + # "socket_id": 0, + # "health_stats": null, + # "namespaces": null, + # "smd_devices": null, + # "dev_state": "UNKNOWN", + # "led_state": "OFF" + # }, + # "ctrlr_namespace_id": 1 # } # ] + # "dev_state": "NORMAL", + # "led_state": "NA", # } # ], # "scm_modules": null, diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 5b8839bfc12..094ae211794 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -155,6 +155,9 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu all_hosts = NodeSet() all_hosts.update(servers) all_hosts.update(clients) + self.provider = provider + self.insecure_mode = insecure_mode + if self.log_dir is None: self.log_dir = self.default_log_dir() if self.shared_dir is None: @@ -165,12 +168,8 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu self.user_dir = self.default_user_dir() if self.interface is None: self.interface = self.default_interface(logger, all_hosts) - if self.provider is None: - self.provider = provider if self.provider is None: self.provider = self.default_provider(logger, servers) - if self.insecure_mode is None: - self.insecure_mode = insecure_mode if self.insecure_mode is None: self.insecure_mode = self.default_insecure_mode() if self.bullseye_src is None: diff --git a/src/tests/ftest/util/io_utilities.py b/src/tests/ftest/util/io_utilities.py index 125248c8e20..95eee4a9e33 100644 --- a/src/tests/ftest/util/io_utilities.py +++ b/src/tests/ftest/util/io_utilities.py @@ -105,7 +105,7 @@ def set_needles_prefix(self, prefix): def get_probe(self): """ Returns a tuple containing a needle file name randomly selected and the - absolute pathname of that file, in that order. + absolute path-name of that file, in that order. 
""" if not self._needles_paths: raise ValueError( diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index f9e9ad82d95..2ac3b3f95af 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -16,6 +16,7 @@ "ofi+cxi", "ofi+verbs;ofi_rxm", "ucx+dc_x", + "ucx+ud_x", "ofi+tcp;ofi_rxm", "ofi+opx" ) diff --git a/src/tests/ftest/util/pool_create_all_base.py b/src/tests/ftest/util/pool_create_all_base.py index 121860669bc..f3b099f5124 100644 --- a/src/tests/ftest/util/pool_create_all_base.py +++ b/src/tests/ftest/util/pool_create_all_base.py @@ -60,10 +60,8 @@ def get_usable_bytes(self): nvme_bytes = 0 for nvme_device in host_storage["storage"]["nvme_devices"]: - if nvme_device["smd_devices"] is None: - continue - for smd_device in nvme_device["smd_devices"]: - if smd_device["dev_state"] == "NORMAL": + if nvme_device["dev_state"] == "NORMAL": + for smd_device in (nvme_device["smd_devices"] or []): nvme_bytes += smd_device["usable_bytes"] nvme_engine_bytes = min(nvme_engine_bytes, nvme_bytes) @@ -310,11 +308,10 @@ def check_pool_distribution(self, scm_delta_bytes, nvme_delta_bytes=None): nvme_bytes = 0 for nvme_device in host_storage["storage"]["nvme_devices"]: - for smd_device in nvme_device["smd_devices"]: - if smd_device["dev_state"] != "NORMAL": - continue - nvme_bytes += smd_device["total_bytes"] - nvme_bytes -= smd_device["avail_bytes"] + if nvme_device["dev_state"] == "NORMAL": + for smd_device in (nvme_device["smd_devices"] or []): + nvme_bytes += smd_device["total_bytes"] + nvme_bytes -= smd_device["avail_bytes"] if nvme_bytes < nvme_used_bytes[0]: nvme_used_bytes[0] = nvme_bytes if nvme_bytes > nvme_used_bytes[1]: diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index a9f286bc7e8..eece8a27399 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -1042,7 +1042,7 @@ def update_config_file_from_file(self, generated_yaml): Use the specified data to generate and distribute the server configuration to the hosts. Also use this data to replace the engine storage configuration so that the storage options - defined in the specified data are configured correctly as part of the server startup. + defined in the specified data are configured correctly as part of the server start-up. Args: generated_yaml (YAMLObject): New server config data. diff --git a/src/tests/ftest/util/server_utils_base.py b/src/tests/ftest/util/server_utils_base.py index a853a961646..bd8b43acefd 100644 --- a/src/tests/ftest/util/server_utils_base.py +++ b/src/tests/ftest/util/server_utils_base.py @@ -538,7 +538,7 @@ def __init__(self): "/run/daos_server/storage/prepare/*", "prepare") # daos_server storage prepare command options: - # --pci-allowlist= Whitespace separated list of PCI + # --pci-allowlist= White-space separated list of PCI # devices (by address) to be unbound from # Kernel driver and used with SPDK # (default is all PCI devices). 
diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 9aaec2fdbf1..e5ee2c2a1fb 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -427,8 +427,8 @@ def launch_vmd_identify_check(self, name, results, args): for value in list(result['response']['host_storage_map'].values()): if value['storage']['smd_info']['devices']: for device in value['storage']['smd_info']['devices']: - if device['led_state'] != "QUICK_BLINK": - failing_vmd.append([device['tr_addr'], value['hosts']]) + if device['ctrlr']['led_state'] != "QUICK_BLINK": + failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']]) status = False params = {"name": name, diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 4fef0130aa7..abccc976a91 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -34,6 +34,9 @@ class TelemetryUtils(): "engine_pool_ops_dkey_punch", "engine_pool_ops_dtx_abort", "engine_pool_ops_dtx_check", + "engine_pool_ops_dtx_coll_abort", + "engine_pool_ops_dtx_coll_check", + "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", "engine_pool_ops_ec_agg", @@ -353,6 +356,18 @@ class TelemetryUtils(): "engine_io_ops_migrate_latency_mean", "engine_io_ops_migrate_latency_min", "engine_io_ops_migrate_latency_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS = [ + "engine_io_ops_obj_coll_punch_active", + "engine_io_ops_obj_coll_punch_active_max", + "engine_io_ops_obj_coll_punch_active_mean", + "engine_io_ops_obj_coll_punch_active_min", + "engine_io_ops_obj_coll_punch_active_stddev"] + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS = [ + "engine_io_ops_obj_coll_punch_latency", + "engine_io_ops_obj_coll_punch_latency_max", + "engine_io_ops_obj_coll_punch_latency_mean", + "engine_io_ops_obj_coll_punch_latency_min", + "engine_io_ops_obj_coll_punch_latency_stddev"] ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS = [ "engine_io_ops_obj_enum_active", "engine_io_ops_obj_enum_active_max", @@ -481,6 +496,8 @@ class TelemetryUtils(): ENGINE_IO_OPS_KEY2ANCHOR_LATENCY_METRICS +\ ENGINE_IO_OPS_MIGRATE_ACTIVE_METRICS +\ ENGINE_IO_OPS_MIGRATE_LATENCY_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_ACTIVE_METRICS +\ + ENGINE_IO_OPS_OBJ_COLL_PUNCH_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_ACTIVE_METRICS +\ ENGINE_IO_OPS_OBJ_ENUM_LATENCY_METRICS +\ ENGINE_IO_OPS_OBJ_PUNCH_ACTIVE_METRICS +\ @@ -563,8 +580,7 @@ class TelemetryUtils(): "engine_mem_vos_dtx_cmt_ent_48", "engine_mem_vos_vos_obj_360", "engine_mem_vos_vos_lru_size", - "engine_mem_dtx_dtx_leader_handle_344", - "engine_mem_dtx_dtx_entry_40"] + "engine_mem_dtx_dtx_leader_handle_360"] ENGINE_MEM_TOTAL_USAGE_METRICS = [ "engine_mem_total_mem"] diff --git a/src/tests/ftest/util/write_host_file.py b/src/tests/ftest/util/write_host_file.py index 8a6e810c6ba..a967c4f3232 100644 --- a/src/tests/ftest/util/write_host_file.py +++ b/src/tests/ftest/util/write_host_file.py @@ -4,18 +4,18 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os -import random from logging import getLogger +from tempfile import mkstemp -def write_host_file(hosts, path='/tmp', slots=1): +def write_host_file(hosts, path='/tmp', slots=None): """Write out a hostfile suitable for orterun. Args: hosts (NodeSet): hosts to write to the hostfile path (str, optional): where to write the hostfile. Defaults to '/tmp'. slots (int, optional): slots per host to specify in the hostfile. - Defaults to 1. + Defaults to None. 
Raises: ValueError: if no hosts have been specified @@ -24,23 +24,19 @@ def write_host_file(hosts, path='/tmp', slots=1): str: the full path of the written hostfile """ - log = getLogger() - unique = random.randint(1, 100000) # nosec + if not hosts: + raise ValueError("hosts parameter must be provided.") - if not os.path.exists(path): - os.makedirs(path) - hostfile = os.path.join(path, "".join(["hostfile", str(unique)])) + log = getLogger() + os.makedirs(path, exist_ok=True) - if not hosts: - raise ValueError("host list parameter must be provided.") + _, hostfile = mkstemp(dir=path, prefix='hostfile_') log.debug("Writing hostfile: %s (hosts=%s, slots=%s)", hostfile, hosts, slots) with open(hostfile, "w") as hostfile_handle: - for host in hosts: - hostfile_line = [host] - if slots: - hostfile_line.append(f"slots={slots}") - hostfile_handle.write(f"{' '.join(hostfile_line)}\n") - log.debug(" %s", " ".join(hostfile_line)) + if slots: + hostfile_handle.writelines(f'{host} slots={slots}\n' for host in sorted(hosts)) + else: + hostfile_handle.writelines(f'{host}\n' for host in sorted(hosts)) return hostfile diff --git a/src/tests/ftest/vmd/fault_reintegration.py b/src/tests/ftest/vmd/fault_reintegration.py index 0380fa9effc..1b4c14402c2 100644 --- a/src/tests/ftest/vmd/fault_reintegration.py +++ b/src/tests/ftest/vmd/fault_reintegration.py @@ -62,8 +62,10 @@ def check_result(self, result, dev_state, led_state): for device in value['storage']['smd_info']['devices']: self.log.debug( 'Verifying: dev_state (%s == %s) and led_state (%s == %s)', - device['dev_state'], dev_state, device['led_state'], led_state) - if device['dev_state'] == dev_state and device['led_state'] == led_state: + device['ctrlr']['dev_state'], dev_state, device['ctrlr']['led_state'], + led_state) + if device['ctrlr']['dev_state'] == dev_state and \ + device['ctrlr']['led_state'] == led_state: return True return False diff --git a/src/tests/suite/daos_obj.c b/src/tests/suite/daos_obj.c index 5415d3fa9fa..0cabff2be2e 100644 --- a/src/tests/suite/daos_obj.c +++ b/src/tests/suite/daos_obj.c @@ -5115,6 +5115,79 @@ oit_list_filter(void **state) test_teardown((void **)&arg); } +#define DTS_DKEY_CNT 8 +#define DTS_DKEY_SIZE 16 +#define DTS_IOSIZE 64 + +static void +obj_coll_punch(test_arg_t *arg, daos_oclass_id_t oclass) +{ + char buf[DTS_IOSIZE]; + char dkeys[DTS_DKEY_CNT][DTS_DKEY_SIZE]; + const char *akey = "daos_io_akey"; + daos_obj_id_t oid; + struct ioreq req; + int i; + + oid = daos_test_oid_gen(arg->coh, oclass, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + for (i = 0; i < DTS_DKEY_CNT; i++) { + dts_buf_render(dkeys[i], DTS_DKEY_SIZE); + dts_buf_render(buf, DTS_IOSIZE); + insert_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + } + + print_message("Collective punch object\n"); + punch_obj(DAOS_TX_NONE, &req); + + print_message("Fetch after punch\n"); + arg->expect_result = -DER_NONEXIST; + for (i = 0; i < DTS_DKEY_CNT; i++) + lookup_empty_single(dkeys[i], akey, 0, buf, DTS_IOSIZE, DAOS_TX_NONE, &req); + + ioreq_fini(&req); +} + +static void +io_50(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_SX\n"); + + if (!test_runable(arg, 2)) + return; + + obj_coll_punch(arg, OC_SX); +} + +static void +io_51(void **state) +{ + test_arg_t *arg = *state; + + print_message("Collective punch object - OC_EC_2P1G2\n"); + + if (!test_runable(arg, 3)) + return; + + obj_coll_punch(arg, OC_EC_2P1G2); +} + +static void +io_52(void **state) +{ + test_arg_t 
*arg = *state; + + print_message("Collective punch object - OC_EC_4P1GX\n"); + + if (!test_runable(arg, 5)) + return; + + obj_coll_punch(arg, OC_EC_4P1GX); +} + static const struct CMUnitTest io_tests[] = { { "IO1: simple update/fetch/verify", io_simple, async_disable, test_case_teardown}, @@ -5213,6 +5286,12 @@ static const struct CMUnitTest io_tests[] = { { "IO47: obj_open perf", obj_open_perf, async_disable, test_case_teardown}, { "IO48: oit_list_filter", oit_list_filter, async_disable, test_case_teardown}, { "IO49: oit_list_filter async", oit_list_filter, async_enable, test_case_teardown}, + { "IO50: collective punch object - OC_SX", + io_50, NULL, test_case_teardown}, + { "IO51: collective punch object - OC_EC_2P1G2", + io_51, NULL, test_case_teardown}, + { "IO52: collective punch object - OC_EC_4P1GX", + io_52, NULL, test_case_teardown}, }; int diff --git a/src/tests/suite/daos_rebuild.c b/src/tests/suite/daos_rebuild.c index 6d80e9d2a04..f64e1cda2b4 100644 --- a/src/tests/suite/daos_rebuild.c +++ b/src/tests/suite/daos_rebuild.c @@ -1114,7 +1114,7 @@ rebuild_fail_all_replicas_before_rebuild(void **state) struct daos_obj_shard *shard; int rc; - if (!test_runable(arg, 6) || arg->pool.alive_svc->rl_nr < 3) + if (!test_runable(arg, 6) || arg->pool.alive_svc->rl_nr < 5) return; oid = daos_test_oid_gen(arg->coh, DAOS_OC_R2S_SPEC_RANK, 0, 0, @@ -1172,11 +1172,11 @@ rebuild_fail_all_replicas(void **state) int rc; /* This test will kill 3 replicas, which might include the ranks - * in svcs, so make sure there are at least 6 ranks in svc, so + * in svcs, so make sure there are at least 7 ranks in svc, so * the new leader can be chosen. */ - if (!test_runable(arg, 6) || arg->pool.alive_svc->rl_nr < 6) { - print_message("need at least 6 svcs, -s6\n"); + if (!test_runable(arg, 7) || arg->pool.alive_svc->rl_nr < 7) { + print_message("need at least 7 svcs, -s7\n"); return; } diff --git a/src/tests/suite/daos_rebuild_ec.c b/src/tests/suite/daos_rebuild_ec.c index 072bb9a4692..b5018e50111 100644 --- a/src/tests/suite/daos_rebuild_ec.c +++ b/src/tests/suite/daos_rebuild_ec.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1254,7 +1254,8 @@ rebuild_ec_multiple_failure_tgts(void **state) } static void -rebuild_ec_parity_overwrite_fail_parity(void **state) +rebuild_ec_parity_overwrite_fail_parity_internal(void **state, int *kill_shards, int nr, + bool aggregation) { test_arg_t *arg = *state; daos_obj_id_t oid; @@ -1262,15 +1263,19 @@ rebuild_ec_parity_overwrite_fail_parity(void **state) char *data; char *verify_data; daos_recx_t recx; - d_rank_t data_ranks[2]; + d_rank_t data_ranks[4]; d_rank_t parity_rank; uint64_t dkey_hash; int shard_idx; int stripe_size = 4 * CELL_SIZE; + int i; if (!test_runable(arg, 8)) return; + if (svc_nreplicas < 5) + return; + oid = daos_test_oid_gen(arg->coh, OC_EC_4P2G1, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); data = (char *)malloc(stripe_size); @@ -1282,6 +1287,9 @@ rebuild_ec_parity_overwrite_fail_parity(void **state) insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, data, stripe_size, &req); + if (aggregation) + trigger_and_wait_ec_aggreation(arg, &oid, 1, "d_key", "a_key", 0, + 0, DAOS_FORCE_EC_AGG); make_buffer(data, 'b', 1000); memcpy(verify_data, data, stripe_size); recx.rx_idx = 0; @@ -1296,13 +1304,16 @@ rebuild_ec_parity_overwrite_fail_parity(void **state) parity_rank = get_rank_by_oid_shard(arg, oid, shard_idx); rebuild_single_pool_rank(arg, parity_rank, true); + print_message("sleep 60 seconds for aggregation\n"); + sleep(60); + /* fail data shard */ - shard_idx = (dkey_hash % 6 + 0) % 6; - data_ranks[0] = get_rank_by_oid_shard(arg, oid, shard_idx); - shard_idx = (dkey_hash % 6 + 1) % 6; - data_ranks[1] = get_rank_by_oid_shard(arg, oid, shard_idx); + for (i = 0; i < nr; i++) { + shard_idx = (dkey_hash % 6 + kill_shards[i]) % 6; + data_ranks[i] = get_rank_by_oid_shard(arg, oid, shard_idx); + } - rebuild_pools_ranks(&arg, 1, data_ranks, 2, true); + rebuild_pools_ranks(&arg, 1, data_ranks, nr, true); recx.rx_idx = 0; /* full stripe */ recx.rx_nr = stripe_size; lookup_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, @@ -1310,7 +1321,7 @@ rebuild_ec_parity_overwrite_fail_parity(void **state) assert_memory_equal(data, verify_data, stripe_size); - reintegrate_pools_ranks(&arg, 1, data_ranks, 2, true); + reintegrate_pools_ranks(&arg, 1, data_ranks, nr, true); reintegrate_pools_ranks(&arg, 1, &parity_rank, 1, true); lookup_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, @@ -1324,6 +1335,26 @@ rebuild_ec_parity_overwrite_fail_parity(void **state) free(verify_data); } +static void +rebuild_ec_overwrite_fail_parity_data(void **state) +{ + int kill_shards[2]; + + kill_shards[0] = 0; + kill_shards[1] = 1; + rebuild_ec_parity_overwrite_fail_parity_internal(state, kill_shards, 2, false); +} + +static void +rebuild_ec_overwrite_fail_parity_data_with_parity(void **state) +{ + int kill_shards[2]; + + kill_shards[0] = 1; + kill_shards[1] = 2; + rebuild_ec_parity_overwrite_fail_parity_internal(state, kill_shards, 2, true); +} + /** create a new pool/container for each test */ static const struct CMUnitTest rebuild_tests[] = { {"REBUILD0: rebuild partial update with data tgt fail", @@ -1457,7 +1488,10 @@ static const struct CMUnitTest rebuild_tests[] = { rebuild_ec_multiple_failure_tgts, rebuild_ec_8nodes_setup, test_teardown}, {"REBUILD46: fail parity shard and data shards after overwrite", - rebuild_ec_parity_overwrite_fail_parity, rebuild_ec_8nodes_setup, + rebuild_ec_overwrite_fail_parity_data, rebuild_ec_8nodes_setup, + test_teardown}, + {"REBUILD47: fail parity shard and data shards 
after overwrite with aggregation", + rebuild_ec_overwrite_fail_parity_data_with_parity, rebuild_ec_8nodes_setup, test_teardown}, }; diff --git a/src/tests/suite/daos_test.h b/src/tests/suite/daos_test.h index b0c43d5b9f7..4c8d6e8f31e 100644 --- a/src/tests/suite/daos_test.h +++ b/src/tests/suite/daos_test.h @@ -377,7 +377,7 @@ daos_prop_t *get_daos_prop_with_owner_and_acl(char *owner, uint32_t owner_type, typedef int (*test_setup_cb_t)(void **state); typedef int (*test_teardown_cb_t)(void **state); -bool test_runable(test_arg_t *arg, unsigned int required_tgts); +bool test_runable(test_arg_t *arg, unsigned int required_nodes); int test_pool_get_info(test_arg_t *arg, daos_pool_info_t *pinfo, d_rank_list_t **engine_ranks); int test_get_leader(test_arg_t *arg, d_rank_t *rank); bool test_rebuild_query(test_arg_t **args, int args_cnt); diff --git a/src/tests/suite/daos_test_common.c b/src/tests/suite/daos_test_common.c index e8b0faf9465..7de287713ce 100644 --- a/src/tests/suite/daos_test_common.c +++ b/src/tests/suite/daos_test_common.c @@ -966,8 +966,8 @@ daos_kill_server(test_arg_t *arg, const uuid_t pool_uuid, rank = arg->srv_nnodes - disable_nodes - 1; arg->srv_disabled_ntgts += tgts_per_node; - if (d_rank_in_rank_list(svc, rank)) - svc->rl_nr--; + rc = d_rank_list_del(svc, rank); + assert_rc_equal(rc, 0); print_message("\tKilling rank %d (total of %d with %d already " "disabled, svc->rl_nr %d)!\n", rank, arg->srv_ntgts, arg->srv_disabled_ntgts - 1, svc->rl_nr); diff --git a/src/tests/suite/dfs_par_test.c b/src/tests/suite/dfs_par_test.c index 8b397d90712..eea7e27d3c7 100644 --- a/src/tests/suite/dfs_par_test.c +++ b/src/tests/suite/dfs_par_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1026,7 +1026,7 @@ run_dfs_par_test(int rank, int size) par_barrier(PAR_COMM_WORLD); /** run tests again with DTX */ - setenv("DFS_USE_DTX", "1", 1); + d_setenv("DFS_USE_DTX", "1", 1); par_barrier(PAR_COMM_WORLD); rc += cmocka_run_group_tests_name("DAOS_FileSystem_DFS_Parallel_DTX", dfs_par_tests, diff --git a/src/tests/suite/dfs_unit_test.c b/src/tests/suite/dfs_unit_test.c index 2e510e33a32..6c2bf8fe1bb 100644 --- a/src/tests/suite/dfs_unit_test.c +++ b/src/tests/suite/dfs_unit_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -3375,7 +3375,7 @@ run_dfs_unit_test(int rank, int size) par_barrier(PAR_COMM_WORLD); /** run tests again with DTX */ - setenv("DFS_USE_DTX", "1", 1); + d_setenv("DFS_USE_DTX", "1", 1); par_barrier(PAR_COMM_WORLD); rc += cmocka_run_group_tests_name("DAOS_FileSystem_DFS_Unit_DTX", dfs_unit_tests, diff --git a/src/tests/suite/dfuse_test.c b/src/tests/suite/dfuse_test.c index 1af1ccffa35..bcff68d5ecf 100644 --- a/src/tests/suite/dfuse_test.c +++ b/src/tests/suite/dfuse_test.c @@ -29,7 +29,7 @@ #include #include -#include "dfuse_ioctl.h" +#include /* Tests can be run by specifying the appropriate argument for a test or all will be run if no test * is specified. 
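The daos_test_common.c change above is a subtle correctness fix rather than a cleanup: decrementing svc->rl_nr merely truncates the rank list, dropping whichever rank happens to sit last, while d_rank_list_del() removes the killed rank itself, so later leader lookups only consider live replicas. A Python analogue of the two behaviours (plain lists stand in for d_rank_list_t):

# Stand-in for d_rank_list_t; suppose rank 2 is the one being killed.
svc = [0, 1, 2, 3, 4]
killed = 2

# Old behaviour: shrink the count if the rank is present -> loses rank 4.
truncated = svc[:-1] if killed in svc else list(svc)

# New behaviour: delete the killed rank itself, as d_rank_list_del() does.
deleted = [rank for rank in svc if rank != killed]

print(truncated)  # [0, 1, 2, 3] -- still lists the dead rank 2
print(deleted)    # [0, 1, 3, 4] -- exactly the surviving replicas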
diff --git a/src/utils/crt_launch/crt_launch.c b/src/utils/crt_launch/crt_launch.c index 0de9281bece..64fccd719de 100644 --- a/src/utils/crt_launch/crt_launch.c +++ b/src/utils/crt_launch/crt_launch.c @@ -156,7 +156,7 @@ get_self_uri(struct host *h, int rank) if (str_port == NULL) return -DER_NOMEM; - setenv("OFI_PORT", str_port, 1); + d_setenv("OFI_PORT", str_port, 1); rc = crt_init(0, CRT_FLAG_BIT_SERVER | CRT_FLAG_BIT_AUTO_SWIM_DISABLE); if (rc != 0) { @@ -233,7 +233,7 @@ generate_group_file(int world_size, struct host *h) } fclose(f); - setenv("CRT_L_GRP_CFG", grp_info_template, true); + d_setenv("CRT_L_GRP_CFG", grp_info_template, true); return 0; } @@ -248,7 +248,7 @@ int main(int argc, char **argv) char str_rank[255]; char str_port[255]; - setenv("D_PORT_AUTO_ADJUST", "1", true); + d_setenv("D_PORT_AUTO_ADJUST", "1", true); if (argc < 2) { show_usage("Insufficient number of arguments"); @@ -314,8 +314,8 @@ int main(int argc, char **argv) sprintf(str_rank, "%d", hostbuf->my_rank); sprintf(str_port, "%d", hostbuf->ofi_port); /* Set CRT_L_RANK and OFI_PORT */ - setenv("CRT_L_RANK", str_rank, true); - setenv("D_PORT", str_port, true); + d_setenv("CRT_L_RANK", str_rank, true); + d_setenv("D_PORT", str_port, true); exit: if (hostbuf) diff --git a/src/utils/daos_dfs_hdlr.c b/src/utils/daos_dfs_hdlr.c index b7cccdf3d73..a986f05dcbb 100644 --- a/src/utils/daos_dfs_hdlr.c +++ b/src/utils/daos_dfs_hdlr.c @@ -19,10 +19,10 @@ #include #include -#include "daos_types.h" -#include "daos_fs.h" +#include +#include #include "dfs_internal.h" -#include "daos_uns.h" +#include #include "daos_hdlr.h" int diff --git a/src/utils/daos_hdlr.c b/src/utils/daos_hdlr.c index efece5c533a..eff26f7b508 100644 --- a/src/utils/daos_hdlr.c +++ b/src/utils/daos_hdlr.c @@ -32,12 +32,12 @@ #include #include -#include "daos_types.h" -#include "daos_api.h" -#include "daos_fs.h" -#include "daos_uns.h" -#include "daos_prop.h" -#include "daos_fs_sys.h" +#include +#include +#include +#include +#include +#include #include "daos_hdlr.h" diff --git a/src/utils/daos_metrics/daos_metrics.c b/src/utils/daos_metrics/daos_metrics.c index 8a8190d5203..b166b1a31f9 100644 --- a/src/utils/daos_metrics/daos_metrics.c +++ b/src/utils/daos_metrics/daos_metrics.c @@ -10,8 +10,8 @@ #include #include -#include "gurt/telemetry_common.h" -#include "gurt/telemetry_consumer.h" +#include +#include static void print_usage(const char *prog_name) diff --git a/src/utils/self_test/self_test.c b/src/utils/self_test/self_test.c index 0be2d7fa5e2..7c9d6e592d5 100644 --- a/src/utils/self_test/self_test.c +++ b/src/utils/self_test/self_test.c @@ -14,7 +14,7 @@ #include #include "crt_utils.h" -#include "daos_errno.h" +#include #include #include diff --git a/src/vea/vea_free.c b/src/vea/vea_free.c index d85cfdb8777..6b41df27c4c 100644 --- a/src/vea/vea_free.c +++ b/src/vea/vea_free.c @@ -1002,7 +1002,7 @@ trigger_aging_flush(struct vea_space_info *vsi, bool force, uint32_t nr_flush, u D_ASSERT(umem_tx_none(vsi->vsi_umem)); cur_time = get_current_age(); - rc = reclaim_unused_bitmap(vsi, 10); + rc = reclaim_unused_bitmap(vsi, MAX_FLUSH_FRAGS); if (rc) goto out; diff --git a/src/vos/ilog.c b/src/vos/ilog.c index e7dd2bff532..1d1d6508087 100644 --- a/src/vos/ilog.c +++ b/src/vos/ilog.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -81,6 +81,8 @@ struct ilog_context { bool ic_in_txn; /** version needs incrementing */ bool ic_ver_inc; + /** For operation with fixed epoch (rebuild, ec_agg, ec_rep, etc.) */ + bool ic_fixed_epoch; }; D_CASSERT(sizeof(struct ilog_id) == sizeof(struct ilog_tree)); @@ -401,7 +403,7 @@ ilog_create(struct umem_instance *umm, struct ilog_df *root) int ilog_open(struct umem_instance *umm, struct ilog_df *root, - const struct ilog_desc_cbs *cbs, daos_handle_t *loh) + const struct ilog_desc_cbs *cbs, bool fixed_epoch, daos_handle_t *loh) { struct ilog_context *lctx; int rc; @@ -412,6 +414,7 @@ ilog_open(struct umem_instance *umm, struct ilog_df *root, if (rc != 0) return rc; + lctx->ic_fixed_epoch = fixed_epoch; *loh = ilog_lctx2hdl(lctx); return 0; @@ -587,7 +590,7 @@ check_equal(struct ilog_context *lctx, struct ilog_id *id_out, const struct ilog D_DEBUG(DB_IO, "No entry found, done\n"); return 0; } - if (dtx_is_committed(id_in->id_tx_id, ilog_ctx2cont(lctx), id_in->id_epoch)) { + if (dtx_is_committed(id_out->id_tx_id, ilog_ctx2cont(lctx), id_out->id_epoch)) { /** Need to differentiate between updates that are * overwrites and others that are conflicts. Return * a different error code in this case if the result @@ -603,7 +606,15 @@ check_equal(struct ilog_context *lctx, struct ilog_id *id_out, const struct ilog id_out->id_update_minor_eph > id_out->id_punch_minor_eph) return -DER_ALREADY; + } else if (lctx->ic_fixed_epoch) { + /* + * For operation with fixed epoch, when update existing ilog entry, + * regard them as the same and use minor epoch for further handling. + */ + *is_equal = true; + return 0; } + D_DEBUG(DB_IO, "Access of incarnation log from multiple DTX" " at same time is not allowed: rc=DER_TX_RESTART\n"); return -DER_TX_RESTART; diff --git a/src/vos/ilog.h b/src/vos/ilog.h index 767064022dd..0cc7ceb5c4f 100644 --- a/src/vos/ilog.h +++ b/src/vos/ilog.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -88,14 +88,15 @@ ilog_create(struct umem_instance *umm, struct ilog_df *root); * * \param umm[IN] The umem instance * \param root[IN] A pointer to the allocated root - * \param cbs[in] Incarnation log transaction log callbacks + * \param cbs[IN] Incarnation log transaction log callbacks + * \param fixed_epoch[IN] It is for operation with fixed epoch or not * \param loh[OUT] Returned open log handle * * \return 0 on success, error code on failure */ int ilog_open(struct umem_instance *umm, struct ilog_df *root, - const struct ilog_desc_cbs *cbs, daos_handle_t *loh); + const struct ilog_desc_cbs *cbs, bool fixed_epoch, daos_handle_t *loh); /** Close an open incarnation log handle * diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index f24ef4fa820..5024e3e2bd8 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -56,7 +56,6 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, dth->dth_pinned = 0; dth->dth_sync = 0; dth->dth_cos_done = 0; - dth->dth_resent = 0; dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_solo = 0; diff --git a/src/vos/tests/vts_ilog.c b/src/vos/tests/vts_ilog.c index 1b85d2cb683..c696ff0b487 100644 --- a/src/vos/tests/vts_ilog.c +++ b/src/vos/tests/vts_ilog.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -482,6 +482,7 @@ ilog_test_update(void **state) struct umem_instance *umm; struct entries *entries = args->custom; struct ilog_df *ilog; + struct ilog_id id; struct version_cache version_cache; daos_epoch_t epoch; daos_handle_t loh; @@ -503,7 +504,7 @@ ilog_test_update(void **state) rc = ilog_create(umm, ilog); LOG_FAIL(rc, 0, "Failed to create a new incarnation log\n"); - rc = ilog_open(umm, ilog, &ilog_callbacks, &loh); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); LOG_FAIL(rc, 0, "Failed to open incarnation log\n"); version_cache_fetch(&version_cache, loh, true); @@ -529,6 +530,14 @@ ilog_test_update(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Commit the punch ilog. */ + id.id_epoch = epoch; + id.id_tx_id = current_tx_id.id_tx_id; + rc = ilog_persist(loh, &id); + assert_rc_equal(rc, 0); + + version_cache_fetch(&version_cache, loh, true); + /** Same epoch, different transaction, same operation. In other * words, both the existing entry and this one are punches so * we get back -DER_ALREADY because the existing entry covers @@ -644,7 +653,7 @@ ilog_test_abort(void **state) rc = ilog_create(umm, ilog); LOG_FAIL(rc, 0, "Failed to create a new incarnation log\n"); - rc = ilog_open(umm, ilog, &ilog_callbacks, &loh); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); LOG_FAIL(rc, 0, "Failed to open new incarnation log\n"); version_cache_fetch(&version_cache, loh, true); @@ -756,7 +765,7 @@ ilog_test_persist(void **state) rc = ilog_create(umm, ilog); LOG_FAIL(rc, 0, "Failed to create a new incarnation log\n"); - rc = ilog_open(umm, ilog, &ilog_callbacks, &loh); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); LOG_FAIL(rc, 0, "Failed to open incarnation log\n"); version_cache_fetch(&version_cache, loh, true); @@ -840,7 +849,7 @@ ilog_test_aggregate(void **state) rc = ilog_create(umm, ilog); LOG_FAIL(rc, 0, "Failed to create a new incarnation log\n"); - rc = ilog_open(umm, ilog, &ilog_callbacks, &loh); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); LOG_FAIL(rc, 0, "Failed to open incarnation log\n"); id.id_epoch = 1; @@ -956,7 +965,7 @@ ilog_test_discard(void **state) rc = ilog_create(umm, ilog); LOG_FAIL(rc, 0, "Failed to create a new incarnation log\n"); - rc = ilog_open(umm, ilog, &ilog_callbacks, &loh); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); LOG_FAIL(rc, 0, "Failed to open incarnation log\n"); id.id_epoch = 1; diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 14ac6407117..cf28bf0d573 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -294,7 +294,7 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, dae->dae_preparing = 0; } - if (unlikely(dth->dth_need_validation && dth->dth_active)) { + if (err == 0 && unlikely(dth->dth_need_validation && dth->dth_active)) { /* Aborted by race during the yield for local TX commit. 
*/ rc = vos_dtx_validation(dth); switch (rc) { @@ -597,7 +597,7 @@ vos_mod_init(void) if (rc) D_ERROR("Failed to initialize incarnation log capability\n"); - d_getenv_int("DAOS_VOS_AGG_THRESH", &vos_agg_nvme_thresh); + d_getenv_uint("DAOS_VOS_AGG_THRESH", &vos_agg_nvme_thresh); if (vos_agg_nvme_thresh == 0 || vos_agg_nvme_thresh > 256) vos_agg_nvme_thresh = VOS_MW_NVME_THRESH; /* Round down to 2^n blocks */ diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 93cc62ceeb5..95fd533ec7a 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -376,7 +376,7 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh) cont->vc_ts_idx = &cont->vc_cont_df->cd_ts_idx; cont->vc_dtx_active_hdl = DAOS_HDL_INVAL; cont->vc_dtx_committed_hdl = DAOS_HDL_INVAL; - if (umoff_is_null(cont->vc_cont_df->cd_dtx_committed_head)) + if (UMOFF_IS_NULL(cont->vc_cont_df->cd_dtx_committed_head)) cont->vc_cmt_dtx_indexed = 1; else cont->vc_cmt_dtx_indexed = 0; diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index cb87ba79866..ff3fc187bfa 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -138,6 +138,7 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, dsp->dsp_xid = DAE_XID(dae); dsp->dsp_oid = DAE_OID(dae); dsp->dsp_epoch = DAE_EPOCH(dae); + dsp->dsp_version = DAE_VER(dae); dsp->dsp_dkey_hash = DAE_DKEY_HASH(dae); mbs = (struct dtx_memberships *)(dsp + 1); @@ -211,6 +212,9 @@ dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, D_FREE(dae->dae_records); dae->dae_rec_cap = 0; DAE_REC_CNT(dae) = 0; + + dae->dae_df_off = UMOFF_NULL; + dae->dae_dbd = NULL; } static int @@ -461,7 +465,7 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df) if (rc != 0) return rc; - while (!umoff_is_null(cont_df->cd_dtx_committed_head)) { + while (!UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)) { dbd_off = cont_df->cd_dtx_committed_head; dbd = umem_off2ptr(umm, dbd_off); cont_df->cd_dtx_committed_head = dbd->dbd_next; @@ -480,22 +484,20 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df) if (rc != 0) return rc; - while (!umoff_is_null(cont_df->cd_dtx_active_head)) { + while (!UMOFF_IS_NULL(cont_df->cd_dtx_active_head)) { dbd_off = cont_df->cd_dtx_active_head; dbd = umem_off2ptr(umm, dbd_off); for (i = 0; i < dbd->dbd_index; i++) { dae_df = &dbd->dbd_active_data[i]; if (!(dae_df->dae_flags & DTE_INVALID)) { - if (!umoff_is_null(dae_df->dae_rec_off)) { - rc = umem_free(umm, - dae_df->dae_rec_off); + if (!UMOFF_IS_NULL(dae_df->dae_rec_off)) { + rc = umem_free(umm, dae_df->dae_rec_off); if (rc != 0) return rc; } - if (!umoff_is_null(dae_df->dae_mbs_off)) { - rc = umem_free(umm, - dae_df->dae_mbs_off); + if (!UMOFF_IS_NULL(dae_df->dae_mbs_off)) { + rc = umem_free(umm, dae_df->dae_mbs_off); if (rc != 0) return rc; } @@ -527,7 +529,7 @@ dtx_ilog_rec_release(struct umem_instance *umm, struct vos_container *cont, ilog = umem_off2ptr(umm, umem_off2offset(rec)); vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); - rc = ilog_open(umm, ilog, &cbs, &loh); + rc = ilog_open(umm, ilog, &cbs, false, &loh); if (rc != 0) return rc; @@ -554,7 +556,7 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, { int rc = 0; - if (umoff_is_null(rec)) + if (UMOFF_IS_NULL(rec)) return 0; switch (dtx_umoff_flag2type(rec)) { @@ -630,7 +632,7 
 	int	i;
 	int	rc = 0;
 
-	if (dae->dae_dbd == NULL)
+	if (unlikely(dae->dae_need_release == 0))
 		return 0;
 
 	/* In spite of for commit or abort, the DTX must be local preparing/prepared. */
@@ -645,7 +647,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 		  "Invalid blob %p magic %x for "DF_DTI" (lid %x)\n",
 		  dbd, dbd->dbd_magic, DP_DTI(&DAE_XID(dae)), DAE_LID(dae));
 
-	if (!umoff_is_null(dae_df->dae_mbs_off)) {
+	if (!UMOFF_IS_NULL(dae_df->dae_mbs_off)) {
 		/* dae_mbs_off will be invalid via flag DTE_INVALID. */
 		rc = umem_free(umm, dae_df->dae_mbs_off);
 		if (rc != 0)
@@ -675,7 +677,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 			return rc;
 	}
 
-	if (!umoff_is_null(dae_df->dae_rec_off)) {
+	if (!UMOFF_IS_NULL(dae_df->dae_rec_off)) {
 		rc = umem_free(umm, dae_df->dae_rec_off);
 		if (rc != 0)
 			return rc;
@@ -747,6 +749,9 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 			UMOFF_P(dbd_off), DP_UUID(cont->vc_id));
 	}
 
+	if (rc == 0)
+		dae->dae_need_release = 0;
+
 	return rc;
 }
 
@@ -903,7 +908,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 	int	rc;
 
 	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
-	if (umoff_is_null(dbd_off)) {
+	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space when create active DTX table.\n");
 		return -DER_NOSPACE;
 	}
@@ -915,7 +920,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 
 	tmp = umem_off2ptr(umm, cont_df->cd_dtx_active_tail);
 	if (tmp == NULL) {
-		D_ASSERT(umoff_is_null(cont_df->cd_dtx_active_head));
+		D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_active_head));
 
 		/* cd_dtx_active_tail is next to cd_dtx_active_head */
 		rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_head,
@@ -949,7 +954,7 @@
 }
 
 static int
-vos_dtx_alloc(struct vos_dtx_blob_df *dbd, struct dtx_handle *dth)
+vos_dtx_alloc(struct umem_instance *umm, struct vos_dtx_blob_df *dbd, struct dtx_handle *dth)
 {
 	struct vos_dtx_act_ent	*dae = NULL;
 	struct vos_container	*cont;
@@ -995,9 +1000,9 @@ vos_dtx_alloc(struct vos_dtx_blob_df *dbd, struct dtx_handle *dth)
 	if (dbd != NULL) {
 		D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC);
 
-		dae->dae_df_off = cont->vc_cont_df->cd_dtx_active_tail +
-				  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
-				  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
+		dae->dae_df_off = umem_ptr2off(umm, dbd) +
+				  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
+				  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
 	}
 
 	/* Will be set as dbd::dbd_index via vos_dtx_prepared(). */
@@ -1425,6 +1430,46 @@ vos_dtx_validation(struct dtx_handle *dth)
 	return vos_dtx_status(dae);
 }
 
+static int
+vos_dtx_active(struct dtx_handle *dth)
+{
+	struct vos_dtx_act_ent	*dae = dth->dth_ent;
+	struct vos_container	*cont;
+	struct vos_cont_df	*cont_df;
+	struct umem_instance	*umm;
+	struct vos_dtx_blob_df	*dbd;
+	int			 rc = 0;
+
+	if (dae->dae_dbd != NULL)
+		goto out;
+
+	cont = vos_hdl2cont(dth->dth_coh);
+	cont_df = cont->vc_cont_df;
+	umm = vos_cont2umm(cont);
+	dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail);
+
+	if (dbd == NULL || dbd->dbd_index >= dbd->dbd_cap) {
+		rc = vos_dtx_extend_act_table(cont);
+		if (rc != 0)
+			goto out;
+
+		dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail);
+	}
+
+	D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC);
+
+	dae->dae_df_off = umem_ptr2off(umm, dbd) +
+			  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
+			  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
+	dae->dae_dbd = dbd;
+
+out:
+	if (rc == 0)
+		dth->dth_active = 1;
+
+	return rc;
+}
+
 /* The caller has started local transaction. */
 int
 vos_dtx_register_record(struct umem_instance *umm, umem_off_t record,
@@ -1439,6 +1484,10 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record,
 		return 0;
 	}
 
+	/*
+	 * Check whether someone touched the DTX before we register the modification
+	 * for the first time (during the prepare, such as bulk data transfer).
+	 */
 	if (unlikely(dth->dth_need_validation && !dth->dth_active)) {
 		rc = vos_dtx_validation(dth);
 		switch (rc) {
@@ -1489,32 +1538,9 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record,
 	}
 
 	if (!dth->dth_active) {
-		struct vos_container	*cont;
-		struct vos_cont_df	*cont_df;
-		struct vos_dtx_blob_df	*dbd;
-
-		cont = vos_hdl2cont(dth->dth_coh);
-		D_ASSERT(cont != NULL);
-
-		umm = vos_cont2umm(cont);
-		cont_df = cont->vc_cont_df;
-
-		dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail);
-		if (dbd == NULL || dbd->dbd_index >= dbd->dbd_cap) {
-			rc = vos_dtx_extend_act_table(cont);
-			if (rc != 0)
-				goto out;
-
-			dbd = umem_off2ptr(umm, cont_df->cd_dtx_active_tail);
-		}
-
-		D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC);
-
-		dae->dae_df_off = cont_df->cd_dtx_active_tail +
-				  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
-				  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
-		dae->dae_dbd = dbd;
-		dth->dth_active = 1;
+		rc = vos_dtx_active(dth);
+		if (rc != 0)
+			goto out;
 	}
 
 	rc = vos_dtx_append(dth, record, type);
@@ -1526,10 +1552,11 @@ vos_dtx_register_record(struct umem_instance *umm, umem_off_t record,
 	}
 
 out:
-	D_DEBUG(DB_TRACE, "Register DTX record for "DF_DTI
-		": lid=%d entry %p, type %d, %s ilog entry, rc %d\n", DP_DTI(&dth->dth_xid),
-		dth->dth_ent == NULL ? 0 : DAE_LID((struct vos_dtx_act_ent *)dth->dth_ent),
-		dth->dth_ent, type, dth->dth_modify_shared ? "has" : "has not", rc);
+	DL_CDEBUG(rc == 0 || rc == -DER_ALREADY, DB_TRACE, DLOG_ERR, rc,
+		  "Register DTX record for "DF_DTI": lid=%d entry %p, type %d, %s ilog entry",
+		  DP_DTI(&dth->dth_xid),
+		  dth->dth_ent == NULL ? 0 : DAE_LID((struct vos_dtx_act_ent *)dth->dth_ent),
+		  dth->dth_ent, type, dth->dth_modify_shared ? "has" : "has not");
"has" : "has not"); return rc; } @@ -1626,8 +1653,8 @@ vos_dtx_deregister_record(struct umem_instance *umm, daos_handle_t coh, int vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) { - struct vos_dtx_act_ent *dae; - struct vos_container *cont; + struct vos_dtx_act_ent *dae = dth->dth_ent; + struct vos_container *cont = vos_hdl2cont(dth->dth_coh); struct umem_instance *umm; struct vos_dtx_blob_df *dbd; umem_off_t rec_off; @@ -1635,18 +1662,31 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) int count; int rc = 0; - if (!dth->dth_active) - return 0; - - cont = vos_hdl2cont(dth->dth_coh); - D_ASSERT(cont != NULL); - - dae = dth->dth_ent; /* There must be vos_dtx_attach() before prepared. */ D_ASSERT(dae != NULL); + D_ASSERT(cont != NULL); D_ASSERT(dae->dae_aborting == 0); D_ASSERT(dae->dae_aborted == 0); + if (!dth->dth_active) { + /* For resend case, do nothing. */ + if (likely(dth->dth_prepared)) + return 0; + + /* + * Even if the transaction modifies nothing locally, we still need to store + * it persistently. Otherwise, the subsequent DTX resync may not find it as + * to regard it as failed transaction and abort it. + */ + rc = vos_dtx_active(dth); + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Active empty transaction " DF_DTI, DP_DTI(&dth->dth_xid)); + + if (rc != 0) + return rc; + } + if (dth->dth_solo) { if (dth->dth_drop_cmt) /* The active DTX entry will be removed via vos_dtx_post_handle() @@ -1716,7 +1756,7 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) DAE_MBS_DSIZE(dae)); } else { rec_off = umem_zalloc(umm, DAE_MBS_DSIZE(dae)); - if (umoff_is_null(rec_off)) { + if (UMOFF_IS_NULL(rec_off)) { D_ERROR("No space to store DTX mbs " DF_DTI"\n", DP_DTI(&DAE_XID(dae))); return -DER_NOSPACE; @@ -1733,7 +1773,7 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) size = sizeof(umem_off_t) * count; rec_off = umem_zalloc(umm, size); - if (umoff_is_null(rec_off)) { + if (UMOFF_IS_NULL(rec_off)) { D_ERROR("No space to store active DTX "DF_DTI"\n", DP_DTI(&DAE_XID(dae))); return -DER_NOSPACE; @@ -1763,6 +1803,7 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) dbd->dbd_index++; dae->dae_preparing = 1; + dae->dae_need_release = 1; return 0; } @@ -1770,6 +1811,7 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) static struct dtx_memberships * vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) { + struct dtx_handle *dth = dae->dae_dth; struct dtx_memberships *tmp; size_t size; @@ -1783,7 +1825,11 @@ vos_dtx_pack_mbs(struct umem_instance *umm, struct vos_dtx_act_ent *dae) tmp->dm_data_size = DAE_MBS_DSIZE(dae); tmp->dm_flags = DAE_MBS_FLAGS(dae); tmp->dm_dte_flags = DAE_FLAGS(dae); - if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae))) + + /* The DTX is not prepared yet, copy the MBS from DTX handle. 
+	if (dth != NULL)
+		memcpy(tmp->dm_data, dth->dth_mbs->dm_data, tmp->dm_data_size);
+	else if (tmp->dm_data_size <= sizeof(DAE_MBS_INLINE(dae)))
 		memcpy(tmp->dm_data, DAE_MBS_INLINE(dae), tmp->dm_data_size);
 	else
 		memcpy(tmp->dm_data, umem_off2ptr(umm, DAE_MBS_OFF(dae)),
@@ -1905,10 +1951,12 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch,
 }
 
 int
-vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships **mbs)
+vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid,
+		 struct dtx_memberships **mbs)
 {
 	struct vos_container	*cont;
 	struct dtx_memberships	*tmp;
+	struct vos_dtx_act_ent	*dae;
 	d_iov_t			 kiov;
 	d_iov_t			 riov;
 	int			 rc;
@@ -1920,14 +1968,24 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, struct dtx_memberships *
 	d_iov_set(&riov, NULL, 0);
 	rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov);
 	if (rc == 0) {
-		tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), riov.iov_buf);
-		if (tmp == NULL)
+		dae = riov.iov_buf;
+		tmp = vos_dtx_pack_mbs(vos_cont2umm(cont), dae);
+		if (tmp == NULL) {
 			rc = -DER_NOMEM;
-		else
+		} else {
+			if (oid != NULL)
+				*oid = DAE_OID(dae);
 			*mbs = tmp;
+		}
+	} else if (rc == -DER_NONEXIST) {
+		rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov);
+		if (rc == 0)
+			rc = 1;
+		else if (rc == -DER_NONEXIST && !cont->vc_cmt_dtx_indexed)
+			rc = -DER_INPROGRESS;
 	}
 
-	if (rc != 0)
+	if (rc < 0)
 		D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc));
 
 	return rc;
@@ -2019,7 +2077,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 		dbd_prev = dbd;
 		/* Need new @dbd */
 		dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
-		if (umoff_is_null(dbd_off)) {
+		if (UMOFF_IS_NULL(dbd_off)) {
 			D_ERROR("No space to store committed DTX %d "DF_DTI"\n",
 				count, DP_DTI(&dtis[cur]));
 			fatal = true;
@@ -2033,8 +2091,8 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 		dbd->dbd_prev = umem_ptr2off(umm, dbd_prev);
 
 		if (dbd_prev == NULL) {
-			D_ASSERT(umoff_is_null(cont_df->cd_dtx_committed_head));
-			D_ASSERT(umoff_is_null(cont_df->cd_dtx_committed_tail));
+			D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_head));
+			D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_tail));
 
 			/* cd_dtx_committed_tail is next to cd_dtx_committed_head */
 			rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_head,
@@ -2588,7 +2646,7 @@ vos_dtx_stat(daos_handle_t coh, struct dtx_stat *stat, uint32_t flags)
 
 	cont_df = cont->vc_cont_df;
 	stat->dtx_newest_aggregated = cont_df->cd_newest_aggregated;
-	if (!umoff_is_null(cont_df->cd_dtx_committed_head)) {
+	if (!UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)) {
 		struct umem_instance		*umm = vos_cont2umm(cont);
 		struct vos_dtx_blob_df		*dbd;
 		struct vos_dtx_cmt_ent_df	*dce;
@@ -2623,9 +2681,7 @@ vos_dtx_mark_committable(struct dtx_handle *dth)
 {
 	struct vos_dtx_act_ent	*dae = dth->dth_ent;
 
-	if (dth->dth_active) {
-		D_ASSERT(dae != NULL);
-
+	if (dae != NULL) {
 		dae->dae_committable = 1;
 		DAE_FLAGS(dae) &= ~(DTE_CORRUPTED | DTE_ORPHAN);
 	}
@@ -2728,6 +2784,7 @@ vos_dtx_act_reindex(struct vos_container *cont)
 			dae->dae_df_off = umem_ptr2off(umm, dae_df);
 			dae->dae_dbd = dbd;
 			dae->dae_prepared = 1;
+			dae->dae_need_release = 1;
 			D_INIT_LIST_HEAD(&dae->dae_link);
 
 			if (DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT) {
@@ -2737,7 +2794,7 @@ vos_dtx_act_reindex(struct vos_container *cont)
 
 				count = DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT;
 				size = sizeof(*dae->dae_records) * count;
-				D_ALLOC(dae->dae_records, size);
+				D_ALLOC_NZ(dae->dae_records, size);
 				if (dae->dae_records == NULL) {
 					dtx_evict_lid(cont, dae);
 					D_GOTO(out, rc = -DER_NOMEM);
@@ -2838,7 +2895,7 @@ vos_dtx_cmt_reindex(daos_handle_t coh)
 			}
 		}
 
-		if (dbd->dbd_count < dbd->dbd_cap || umoff_is_null(dbd->dbd_next))
+		if (dbd->dbd_count < dbd->dbd_cap || UMOFF_IS_NULL(dbd->dbd_next))
 			D_GOTO(out, rc = 1);
 
 		cont->vc_cmt_dtx_reindex_pos = dbd->dbd_next;
@@ -2870,64 +2927,44 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth)
 		return;
 
 	cont = vos_hdl2cont(dth->dth_coh);
+	dae = dth->dth_ent;
 
 	if (dth->dth_pinned) {
 		/* Only keep the DTX entry (header) for handling resend RPC,
 		 * remove DTX records, purge related VOS objects from cache.
 		 */
-		dae = dth->dth_ent;
 		if (dae != NULL)
			dtx_act_ent_cleanup(cont, dae, dth, true);
 	} else {
 		d_iov_set(&kiov, &dth->dth_xid, sizeof(dth->dth_xid));
 		d_iov_set(&riov, NULL, 0);
-		rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov);
-		if (rc == -DER_NONEXIST) {
-			rc = dbtree_lookup(cont->vc_dtx_committed_hdl, &kiov, &riov);
-			/* Cannot cleanup 'committed' DTX entry. */
-			if (rc == 0)
-				goto out;
-		}
-
-		if (rc != 0) {
-			if (rc != -DER_NONEXIST) {
-				D_ERROR("Fail to remove DTX entry "DF_DTI":"
-					DF_RC"\n",
-					DP_DTI(&dth->dth_xid), DP_RC(rc));
-
-				dae = dth->dth_ent;
-				if (dae != NULL) {
-					/* Cannot cleanup 'prepare'/'commit' DTX entry. */
-					if (vos_dae_is_prepare(dae) || vos_dae_is_commit(dae))
-						goto out;
-
-					dae->dae_aborted = 1;
-				}
-			} else {
-				rc = 0;
-			}
-		} else {
+		if (dae == NULL) {
+			rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov);
+			/* Need not search the committed table; a 'committed' entry cannot be cleaned up. */
+			if (rc != 0)
+				return;
+
 			dae = (struct vos_dtx_act_ent *)riov.iov_buf;
-			if (dth->dth_ent != NULL)
-				D_ASSERT(dth->dth_ent == dae);
+		}
 
-			/* Cannot cleanup 'prepare'/'commit' DTX entry. */
-			if (vos_dae_is_prepare(dae) || vos_dae_is_commit(dae))
-				goto out;
+		/* Cannot cleanup 'prepare'/'commit' DTX entry. */
+		if (vos_dae_is_prepare(dae) || vos_dae_is_commit(dae))
+			goto out;
 
-			/* Skip the @dae if it belong to another instance for resent request. */
-			if (DAE_EPOCH(dae) != dth->dth_epoch)
-				goto out;
+		/* Skip the @dae if it belongs to another instance for a resent request. */
+		if (DAE_EPOCH(dae) != dth->dth_epoch)
+			goto out;
 
-			rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_BYPASS, &kiov, &dae);
-			D_ASSERT(rc == 0);
-		}
+		dtx_act_ent_cleanup(cont, dae, dth, true);
 
-		if (dae != NULL) {
-			dtx_act_ent_cleanup(cont, dae, dth, true);
-			if (rc == 0)
-				dtx_evict_lid(cont, dae);
-		}
+		rc = dbtree_delete(cont->vc_dtx_active_hdl,
+				   riov.iov_buf != NULL ? BTR_PROBE_BYPASS : BTR_PROBE_EQ,
+				   &kiov, &dae);
+		if (rc == 0 || rc == -DER_NONEXIST)
+			dtx_evict_lid(cont, dae);
+		else
+			dae->dae_aborted = 1;
 
 out:
 	dth->dth_ent = NULL;
@@ -2945,7 +2982,7 @@ vos_dtx_cleanup(struct dtx_handle *dth, bool unpin)
 
 	dae = dth->dth_ent;
 	if (dae == NULL) {
-		if (!dth->dth_active)
+		if (!dth->dth_active && !unpin)
 			return;
 	} else {
 		/* 'prepared'/'preparing' DTX can be either committed or aborted, not cleanup. */
@@ -2978,7 +3015,7 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist)
 		return 0;
 
 	cont = vos_hdl2cont(dth->dth_coh);
-	D_ASSERT(cont != NULL);
+	umm = vos_cont2umm(cont);
 
 	if (dth->dth_ent != NULL) {
 		D_ASSERT(persistent);
@@ -3005,7 +3042,6 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist)
 	}
 
 	if (persistent) {
-		umm = vos_cont2umm(cont);
 		rc = umem_tx_begin(umm, NULL);
 		if (rc != 0)
 			goto out;
@@ -3022,7 +3058,7 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist)
 	}
 
 	if (dth->dth_ent == NULL) {
-		rc = vos_dtx_alloc(dbd, dth);
+		rc = vos_dtx_alloc(umm, dbd, dth);
 	} else if (persistent) {
 		D_ASSERT(dbd != NULL);
 		D_ASSERT(dbd->dbd_magic == DTX_ACT_BLOB_MAGIC);
@@ -3030,9 +3066,9 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist)
 		dae = dth->dth_ent;
 		D_ASSERT(dae->dae_dbd == NULL);
 
-		dae->dae_df_off = cont->vc_cont_df->cd_dtx_active_tail +
-				  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
-				  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
+		dae->dae_df_off = umem_ptr2off(umm, dbd) +
+				  offsetof(struct vos_dtx_blob_df, dbd_active_data) +
+				  sizeof(struct vos_dtx_act_ent_df) * dbd->dbd_index;
 		dae->dae_dbd = dbd;
 	}
 
diff --git a/src/vos/vos_ilog.c b/src/vos/vos_ilog.c
index dea2a22f28f..1a13e4d6a28 100644
--- a/src/vos/vos_ilog.c
+++ b/src/vos/vos_ilog.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -428,7 +428,7 @@ int vos_ilog_update_(struct vos_container *cont, struct ilog_df *ilog,
 	}
 
 	vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont));
-	rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, &loh);
+	rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh);
 	if (rc != 0) {
 		D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc));
 		return rc;
@@ -525,7 +525,7 @@ vos_ilog_punch_(struct vos_container *cont, struct ilog_df *ilog,
 
 punch_log:
 	vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont));
-	rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, &loh);
+	rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh);
 	if (rc != 0) {
 		D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc));
 		return rc;
diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h
index 819fe7b4186..3fede0acd83 100644
--- a/src/vos/vos_internal.h
+++ b/src/vos/vos_internal.h
@@ -405,6 +405,7 @@ struct vos_dtx_act_ent {
 					 dae_maybe_shared:1,
 					 /* Need validation on leader before commit/committable. */
 					 dae_need_validation:1,
+					 dae_need_release:1,
 					 dae_preparing:1,
 					 dae_prepared:1;
 };
@@ -1585,12 +1586,6 @@ vos_exec(void (*func)(void *), void *arg)
 	return 0;
 }
 
-static inline bool
-umoff_is_null(umem_off_t umoff)
-{
-	return umoff == UMOFF_NULL;
-}
-
 /* vos_csum_recalc.c */
 
 struct csum_recalc {
diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c
index 58d1dba1021..d1b2a713d03 100644
--- a/src/vos/vos_obj_index.c
+++ b/src/vos/vos_obj_index.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -288,7 +288,7 @@ vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid,
 	if (!log)
 		goto skip_log;
 	vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont));
-	rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, &loh);
+	rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, dth == NULL, &loh);
 	if (rc != 0)
 		return rc;
 
diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml
index e265732b447..c58f11d5d74 100644
--- a/utils/config/daos_server.yml
+++ b/utils/config/daos_server.yml
@@ -449,27 +449,6 @@
 #    - wal
 #
 #
-#    # Specify accelerator engine setting (experimental).
-#
-#    # Acceleration engine options are:
-#    # - "none" do not use an acceleration engine. (default)
-#    # - "spdk" to assigned management of hardware/software acceleration to SPDK.
-#    # - "dml" te set DML as accelerator engine.
-#
-#    # Optional capability settings are:
-#    # - "move" to enable acceleration of MOVE instructions.
-#    # - "crc" to enable acceleration of CRC instructions.
-#
-#    # If acceleration engine setting is "none" (or unset) then optional capabilities are all set to
-#    # false. If set to "spdk" or "dml" then optional capabilities are set to "true" by default.
-#
-#    acceleration:
-#      engine: spdk
-#      options:
-#        - move
-#        - crc
-#
-#
 #-
 #  # Number of I/O service threads (and network endpoints) per engine.
 #  # Immutable after running "dmg storage format".
@@ -617,23 +596,3 @@
 #
 #    # See about bdev_roles above.
 #    bdev_roles: [wal, meta, data]
-#
-#
-#    # Specify accelerator engine setting (experimental).
-#
-#    # Acceleration engine options are:
-#    # - "none" do not use an acceleration engine. (default)
-#    # - "spdk" to assigned management of hardware/software acceleration to SPDK.
-#    # - "dml" te set DML as accelerator engine.
-#
-#    # Optional capability settings are:
-#    # - "move" to enable acceleration of MOVE instructions.
-#    # - "crc" to enable acceleration of CRC instructions.
-#
-#    # If acceleration engine setting is "none" (or unset) then optional capabilities are all set to
-#    # false. If set to "spdk" or "dml" then optional capabilities are set to "true" by default.
-#
-#    acceleration:
-#      engine: dml
-#      options:
-#        - crc
diff --git a/utils/cq/daos_pylint.py b/utils/cq/daos_pylint.py
index 22e1fbc997c..3c9ca6633c1 100755
--- a/utils/cq/daos_pylint.py
+++ b/utils/cq/daos_pylint.py
@@ -547,10 +547,6 @@ def main():
             print(full_version)
         sys.exit(0)
 
-    if args.diff:
-        print('This option is no longer used')
-        sys.exit(1)
-
     rc_tmp = None
 
     # If spellings are likely supported and using the default configuration file then enable using
diff --git a/utils/cq/requirements.txt b/utils/cq/requirements.txt
index bcbda5fc64d..5da24fa4c56 100644
--- a/utils/cq/requirements.txt
+++ b/utils/cq/requirements.txt
@@ -9,7 +9,7 @@ pyenchant
 ## flake8 6 removed --diff option which breaks flake precommit hook.
 ## https://github.com/pycqa/flake8/issues/1389 https://github.com/PyCQA/flake8/pull/1720
 flake8<6.0.0
-isort==5.12
-pylint==3.0.2
+isort==5.13.2
+pylint==3.0.3
 yamllint==1.33.0
 codespell==2.2.6
diff --git a/utils/cq/words.dict b/utils/cq/words.dict
index e062dc6dcc9..d6a5f6f80a6 100644
--- a/utils/cq/words.dict
+++ b/utils/cq/words.dict
@@ -315,6 +315,7 @@ params
 patchelf
 pci
 pda
+parallelized
 pdesc
 perf
 performant
@@ -334,6 +335,7 @@ ppn
 prebuild
 prebuilding
 prebuilt
+predefine
 preload
 prepend
 prepended
@@ -366,6 +368,7 @@ recurse
 redistributions
 refactor
 repo
+reproducibility
 returncode
 rf
 rmdir
@@ -462,12 +465,14 @@ uint
 umount
 umounting
 unicode
+uninterruptible
 unittest
 unlink
 unlinked
 unlinking
 unmangle
 unmount
+unordered
 unwritable
 uri
 url
@@ -484,6 +489,7 @@ uuidstr
 valgrind
 vendored
 ver
+versa
 versioned
 vm
 vos
diff --git a/utils/test_memcheck.supp b/utils/test_memcheck.supp
index 4f0d88fb28d..d1260ecf572 100644
--- a/utils/test_memcheck.supp
+++ b/utils/test_memcheck.supp
@@ -383,3 +383,9 @@
    ...
    fun:mdb_txn_commit
 }
+{
+   go_runtime_syscall_param
+   Memcheck:Param
+   write(buf)
+   fun:runtime/internal/syscall.Syscall6
+}