From 76cfb41b852f94859a32caaa7b3b379f069d0881 Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Tue, 15 Oct 2024 10:18:18 -0500 Subject: [PATCH 01/26] DAOS-16556 client: call fstat() before mmap() to update file status in kernel (#15304) Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/int_dfs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index fff49d9f0ae7..3087cb09c75f 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -6224,6 +6224,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) atomic_fetch_add_relaxed(&num_mmap, 1); + if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) { + /* DAOS-14494: Force the kernel to update the size before mapping. */ + rc = next_fxstat(1, fd, &stat_buf); + if (rc == -1) + return MAP_FAILED; + return next_mmap(addr, length, prot, flags, fd, offset); + } + addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset); if (addr_ret == MAP_FAILED) return MAP_FAILED; @@ -6842,10 +6850,8 @@ init_myhook(void) register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl)); - if (d_compatible_mode == false) { - register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); - register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); - } + register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); + register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); From b4eb68912b13345905efc98bdf2ed9b1c948113d Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Wed, 16 Oct 2024 09:40:29 +0900 Subject: [PATCH 02/26] =?UTF-8?q?DAOS-16446=20test:=20HDF5-VOL=20test=20-?= =?UTF-8?q?=20Set=20object=20class=20and=20container=20prope=E2=80=A6=20(#?= =?UTF-8?q?15004)=20(#15098)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In HDF5, DFS, MPIIO, or POSIX, object class and container properties are defined during the container create. If it’s DFS, object class is also set to the IOR parameter. However, in HDF5-VOL, object class and container properties are defined with the following environment variables of mpirun. HDF5_DAOS_OBJ_CLASS (Object class) HDF5_DAOS_FILE_PROP (Container properties) The infrastructure to set these variables are already there in run_ior_with_pool(). In file_count_test_base.py, pass in the env vars to run_ior_with_pool(env=env) as a dictionary. Object class is the oclass variable. Container properties can be obtained from self.container.properties.value. This fix is discussed in PR #14964. Signed-off-by: Makito Kano --- src/tests/ftest/util/file_count_test_base.py | 31 ++++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index 12c66d76b8cc..f95e22bd4bdd 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase): :avocado: recursive """ - def add_containers(self, file_oclass=None, dir_oclass=None): - """Create a list of containers that the various jobs use for storage. + def get_file_write_container(self, file_oclass=None, dir_oclass=None): + """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass. Args: - file_oclass (str, optional): file object class of container. - Defaults to None. - dir_oclass (str, optional): dir object class of container. - Defaults to None. + file_oclass (str, optional): file object class of container. Defaults to None. + dir_oclass (str, optional): dir object class of container. Defaults to None. + Returns: + TestContainer: Created container with oclass, dir_oclass, and rd_fac set. """ # Create a container and add it to the overall list of containers @@ -92,7 +92,7 @@ def run_file_count(self): rd_fac = extract_redundancy_factor(oclass) dir_oclass = self.get_diroclass(rd_fac) self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass) - self.container = self.add_containers(oclass, dir_oclass) + self.container = self.get_file_write_container(oclass, dir_oclass) try: self.processes = mdtest_np self.ppn = mdtest_ppn @@ -111,14 +111,27 @@ def run_file_count(self): # run ior self.log.info("=======>>>Starting IOR with %s and %s", api, oclass) self.ior_cmd.dfs_oclass.update(oclass) - self.container = self.add_containers(oclass) + self.container = self.get_file_write_container(oclass) self.update_ior_cmd_with_pool(False) try: self.processes = ior_np self.ppn = ior_ppn if api == 'HDF5-VOL': + # Format the container properties so that it works with HDF5-VOL env var. + # Each entry:value pair needs to be separated by a semicolon. Since we're + # using this in the mpirun command, semicolon would indicate the end of the + # command, so quote the whole thing. + cont_props = self.container.properties.value + cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"' + self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol) + env = self.ior_cmd.env.copy() + env.update({ + "HDF5_DAOS_OBJ_CLASS": oclass, + "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol + }) self.ior_cmd.api.update('HDF5') - self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path) + self.run_ior_with_pool( + create_pool=False, plugin_path=hdf5_plugin_path, env=env) elif self.ior_cmd.api.value == 'POSIX': self.run_ior_with_pool(create_pool=False, intercept=intercept) else: From 6e16c8e21bcc1cb5b29c62aca45e309201c0d9ca Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 16 Oct 2024 02:43:26 +0200 Subject: [PATCH 03/26] DAOS-16673 common: ignore Hadoop 3.4.0 related CVE (#15320) Hadoope 3.4.0 has resolved a few CVE issues but introduces new + enable Trivy scans on release branch + enable on demand scan and scan on final PR merge. Signed-off-by: Tomasz Gromadzki --- .github/workflows/trivy.yml | 70 ++++++++++ utils/trivy/.trivyignore | 27 ++++ utils/trivy/csv.tpl | 29 +++++ utils/trivy/trivy.yaml | 249 ++++++++++++++++++++++++++++++++++++ 4 files changed, 375 insertions(+) create mode 100644 .github/workflows/trivy.yml create mode 100644 utils/trivy/.trivyignore create mode 100644 utils/trivy/csv.tpl create mode 100644 utils/trivy/trivy.yaml diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml new file mode 100644 index 000000000000..8f5524d4513c --- /dev/null +++ b/.github/workflows/trivy.yml @@ -0,0 +1,70 @@ +name: Trivy scan + +on: + workflow_dispatch: + push: + branches: ["master", "release/**"] + pull_request: + branches: ["master", "release/**"] + +# Declare default permissions as nothing. +permissions: {} + +jobs: + build: + name: Build + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Prepare the report to be uploaded to the GitHub artifact store + run: | + mkdir report + cp trivy-report-daos.txt report + cp utils/trivy/.trivyignore report/trivyignore.txt + + - name: Upload the report to the GitHub artifact store + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + path: report/* + name: trivy-report-daos + + - name: Adjust config file to use sarif format + run: | + sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \ + utils/trivy/trivy.yaml + sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a + # 3.25.15 (v3) + with: + sarif_file: 'trivy-results.sarif' + + - name: Adjust config file to show and validate scan results + run: | + sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml + sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml + sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore new file mode 100644 index 000000000000..3a3b4cff1cee --- /dev/null +++ b/utils/trivy/.trivyignore @@ -0,0 +1,27 @@ +## Ignored hadoop 3.3.6 related CVE +## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428 +CVE-2023-52428 +## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410 +CVE-2023-39410 +## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710 +CVE-2024-25710 +## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308 +CVE-2024-26308 +## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131 +CVE-2024-29131 +## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133 +CVE-2024-29133 +## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638 +CVE-2024-25638 + +## Ignored hadoop 3.4.0 related CVE +## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561 +CVE-2024-47561 +## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201 +CVE-2023-33201 +## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857 +CVE-2024-29857 +## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171 +CVE-2024-30171 +## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172 +CVE-2024-30172 diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl new file mode 100644 index 000000000000..d3bcafcef5e9 --- /dev/null +++ b/utils/trivy/csv.tpl @@ -0,0 +1,29 @@ +{{ range . }} +Trivy Vulnerability Scan Results ({{- .Target -}}) +VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information +{{ range .Vulnerabilities }} + {{- .VulnerabilityID }}, + {{- .Severity }}, + {{- range $key, $value := .CVSS }} + {{- if (eq $key "nvd") }} + {{- .V3Score -}} + {{- end }} + {{- end }}, + {{- quote .Title }}, + {{- quote .PkgName }}, + {{- quote .InstalledVersion }}, + {{- quote .FixedVersion }}, + {{- .PrimaryURL }} +{{ else -}} + No vulnerabilities found at this time. +{{ end }} +Trivy Dependency Scan Results ({{ .Target }}) +ID,Name,Version,Notes +{{ range .Packages -}} + {{- quote .ID }}, + {{- quote .Name }}, + {{- quote .Version }} +{{ else -}} + No dependencies found at this time. +{{ end }} +{{ end }} \ No newline at end of file diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml new file mode 100644 index 000000000000..cfb13b5c40f4 --- /dev/null +++ b/utils/trivy/trivy.yaml @@ -0,0 +1,249 @@ +cache: + backend: fs + dir: + redis: + ca: "" + cert: "" + key: "" + tls: false + ttl: 0s +config: trivy.yaml +db: + download-java-only: false + download-only: false + java-repository: ghcr.io/aquasecurity/trivy-java-db + java-skip-update: false + no-progress: false + repository: ghcr.io/aquasecurity/trivy-db + skip-update: false +debug: false +dependency-tree: true +exit-code: 0 +generate-default-config: false +ignore-policy: "" +ignorefile: ./utils/trivy/.trivyignore +include-dev-deps: false +insecure: false +license: + confidencelevel: "0.9" + forbidden: + - AGPL-1.0 + - AGPL-3.0 + - CC-BY-NC-1.0 + - CC-BY-NC-2.0 + - CC-BY-NC-2.5 + - CC-BY-NC-3.0 + - CC-BY-NC-4.0 + - CC-BY-NC-ND-1.0 + - CC-BY-NC-ND-2.0 + - CC-BY-NC-ND-2.5 + - CC-BY-NC-ND-3.0 + - CC-BY-NC-ND-4.0 + - CC-BY-NC-SA-1.0 + - CC-BY-NC-SA-2.0 + - CC-BY-NC-SA-2.5 + - CC-BY-NC-SA-3.0 + - CC-BY-NC-SA-4.0 + - Commons-Clause + - Facebook-2-Clause + - Facebook-3-Clause + - Facebook-Examples + - WTFPL + full: false + ignored: [] + notice: + - AFL-1.1 + - AFL-1.2 + - AFL-2.0 + - AFL-2.1 + - AFL-3.0 + - Apache-1.0 + - Apache-1.1 + - Apache-2.0 + - Artistic-1.0-cl8 + - Artistic-1.0-Perl + - Artistic-1.0 + - Artistic-2.0 + - BSL-1.0 + - BSD-2-Clause-FreeBSD + - BSD-2-Clause-NetBSD + - BSD-2-Clause + - BSD-3-Clause-Attribution + - BSD-3-Clause-Clear + - BSD-3-Clause-LBNL + - BSD-3-Clause + - BSD-4-Clause + - BSD-4-Clause-UC + - BSD-Protection + - CC-BY-1.0 + - CC-BY-2.0 + - CC-BY-2.5 + - CC-BY-3.0 + - CC-BY-4.0 + - FTL + - ISC + - ImageMagick + - Libpng + - Lil-1.0 + - Linux-OpenIB + - LPL-1.02 + - LPL-1.0 + - MS-PL + - MIT + - NCSA + - OpenSSL + - PHP-3.01 + - PHP-3.0 + - PIL + - Python-2.0 + - Python-2.0-complete + - PostgreSQL + - SGI-B-1.0 + - SGI-B-1.1 + - SGI-B-2.0 + - Unicode-DFS-2015 + - Unicode-DFS-2016 + - Unicode-TOU + - UPL-1.0 + - W3C-19980720 + - W3C-20150513 + - W3C + - X11 + - Xnet + - Zend-2.0 + - zlib-acknowledgement + - Zlib + - ZPL-1.1 + - ZPL-2.0 + - ZPL-2.1 + permissive: [] + reciprocal: + - APSL-1.0 + - APSL-1.1 + - APSL-1.2 + - APSL-2.0 + - CDDL-1.0 + - CDDL-1.1 + - CPL-1.0 + - EPL-1.0 + - EPL-2.0 + - FreeImage + - IPL-1.0 + - MPL-1.0 + - MPL-1.1 + - MPL-2.0 + - Ruby + restricted: + - BCL + - CC-BY-ND-1.0 + - CC-BY-ND-2.0 + - CC-BY-ND-2.5 + - CC-BY-ND-3.0 + - CC-BY-ND-4.0 + - CC-BY-SA-1.0 + - CC-BY-SA-2.0 + - CC-BY-SA-2.5 + - CC-BY-SA-3.0 + - CC-BY-SA-4.0 + - GPL-1.0 + - GPL-2.0 + - GPL-2.0-with-autoconf-exception + - GPL-2.0-with-bison-exception + - GPL-2.0-with-classpath-exception + - GPL-2.0-with-font-exception + - GPL-2.0-with-GCC-exception + - GPL-3.0 + - GPL-3.0-with-autoconf-exception + - GPL-3.0-with-GCC-exception + - LGPL-2.0 + - LGPL-2.1 + - LGPL-3.0 + - NPL-1.0 + - NPL-1.1 + - OSL-1.0 + - OSL-1.1 + - OSL-2.0 + - OSL-2.1 + - OSL-3.0 + - QPL-1.0 + - Sleepycat + unencumbered: + - CC0-1.0 + - Unlicense + - 0BSD +list-all-pkgs: false +misconfiguration: + cloudformation: + params: [] + helm: + set: [] + set-file: [] + set-string: [] + values: [] + include-non-failures: false + check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0 + # scanners: + # - azure-arm + # - cloudformation + # - dockerfile + # - helm + # - kubernetes + # - terraform + # - terraformplan + terraform: + exclude-downloaded-modules: false + vars: [] +module: + dir: + enable-modules: [] +output: "trivy-report-daos.txt" +format: template +template: '@./utils/trivy/csv.tpl' +output-plugin-arg: "" +quiet: false +registry: + password: [] + token: "" + username: [] +rego: + data: [] + namespaces: [] + policy: [] + skip-policy-update: false + trace: false +report: all +scan: + compliance: "" + file-patterns: [] + offline: false + parallel: 1 + rekor-url: https://rekor.sigstore.dev + sbom-sources: [] + scanners: + - vuln + - secret + # ignore all hadoop dependencies + skip-dirs: + ./src/client/java/hadoop-daos + skip-files: [] + show-suppressed: true +secret: + config: trivy-secret.yaml +server: + addr: "" + custom-headers: [] + token: "" + token-header: Trivy-Token +severity: + - UNKNOWN + - MEDIUM + - HIGH + - CRITICAL +timeout: 5m0s +version: false +vulnerability: + ignore-status: [] + ignore-unfixed: false + type: + - os + - library From d9f16a1d6f7326ace19b1f7d4a6f182b4bde0020 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 16 Oct 2024 02:48:45 +0200 Subject: [PATCH 04/26] DAOS-14408 common: ensure NDCTL not used for storage class `ram` (#15203) * DAOS-14408 common: enable NDCTL for DCPM This PR prepares DAOS to be used with NDCTL enabled in PMDK, which means: - NDCTL must not be used when non-DCPM (simulate PMem) - `storage class: "ram"` is used: `PMEMOBJ_CONF=sds.at_create=0` env variable disables NDCTL features in the PMDK This change affects all tests run on simulated PMem (e.g. inside VMs). Some DOAS utility applications may also require `PMEMOBJ_CONF=sds.at_create=0` to be set. - The default ULT stack size must be at least 20KiB to avoid stack overuse by PMDK with NDCTL enabled and be aligned with Linux page size. `ABT_THREAD_STACKSIZE=20480` env variable is used to increase the default ULT stack size. This env variable is set by control/server module just before engine is started. Much bigger stack is used for pmempool open/create-related tasks e.g. `tgt_vos_create_one` to avoid stack overusage. This modification shall not affect md-on-ssd mode as long as `storage class: "ram"` is used for the first tier in the `storage` configuration. This change does not require any configuration changes to existing systems. The new PMDK package with NDCTL enabled (https://github.com/daos-stack/pmdk/pull/38) will land as soon as this PR is merged. Signed-off-by: Jan Michalski --- .github/workflows/landing-builds.yml | 1 + debian/changelog | 16 ++ debian/control | 6 +- site_scons/components/__init__.py | 1 - src/control/server/engine/config.go | 86 ++++++++++ src/control/server/engine/config_test.go | 207 +++++++++++++++++++++++ src/control/server/server.go | 6 + src/mgmt/srv_target.c | 2 +- utils/build.config | 3 +- utils/rpms/daos.rpmlintrc | 10 +- utils/rpms/daos.spec | 29 +++- utils/run_utest.py | 1 + utils/scripts/install-el8.sh | 2 + utils/scripts/install-el9.sh | 2 + utils/scripts/install-leap15.sh | 1 + utils/scripts/install-ubuntu.sh | 2 + 16 files changed, 361 insertions(+), 14 deletions(-) diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml index 103f150915a4..814c4f0e71f1 100644 --- a/.github/workflows/landing-builds.yml +++ b/.github/workflows/landing-builds.yml @@ -16,6 +16,7 @@ on: - ci/** - requirements-build.txt - requirements-utest.txt + - utils/build.config permissions: {} diff --git a/debian/changelog b/debian/changelog index 6891cea4737f..9790dcbee3e6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,19 @@ +daos (2.6.1-4) unstable; urgency=medium + [ Tomasz Gromadzki ] + * Add support of the PMDK package 2.1.0 with NDCTL enabled. + * Increase the default ULT stack size to 20KiB if the engine uses + the DCPM storage class. + * Prevent using the RAM storage class (simulated PMem) when + the shutdown state (SDS) is active. + * Automatically disable SDS for the RAM storage class on engine startup. + * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' + environment variable to deactivate SDS for the DAOS tools + (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. + Otherwise, a user is supposed to be stopped by an error + like: "Unsafe shutdown count is not supported for this source". + + -- Tomasz Gromadzki Wed, 02 Oct 2024 12:00:00 +0200 + daos (2.6.1-3) unstable; urgency=medium [ Phillip Henderson ] * Third release candidate for 2.6.1 diff --git a/debian/control b/debian/control index bdc377e6d94a..32b97fd67d18 100644 --- a/debian/control +++ b/debian/control @@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10), python3-distro, libabt-dev, libucx-dev, - libpmemobj-dev (>= 2.0.0), + libpmemobj-dev (>= 2.1.0), libfuse3-dev, libprotobuf-c-dev, libjson-c-dev, @@ -118,7 +118,9 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml, daos-client (= ${binary:Version}), daos-admin (= ${binary:Version}), golang-go (>=1.18), - libcapstone-dev + libcapstone-dev, + libndctl-dev, + libdaxctl-dev Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source software-defined object store designed from the ground up for massively distributed Non Volatile Memory (NVM). DAOS takes advantage diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py index ed50b80461b5..1ae7bb7a2aa3 100644 --- a/site_scons/components/__init__.py +++ b/site_scons/components/__init__.py @@ -266,7 +266,6 @@ def define_components(reqs): retriever=GitRepoRetriever(), commands=[['make', 'all', - 'NDCTL_ENABLE=n', 'BUILD_EXAMPLES=n', 'BUILD_BENCHMARKS=n', 'DOC=n', diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index 3e6ab0b64d79..38595fd8091a 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -9,6 +9,7 @@ package engine import ( "fmt" "os" + "strconv" "strings" "github.com/pkg/errors" @@ -28,6 +29,8 @@ const ( envLogMasks = "D_LOG_MASK" envLogDbgStreams = "DD_MASK" envLogSubsystems = "DD_SUBSYS" + + minABTThreadStackSizeDCPM = 20480 ) // FabricConfig encapsulates networking fabric configuration. @@ -342,7 +345,80 @@ func (c *Config) Validate() error { if err := ValidateLogSubsystems(subsystems); err != nil { return errors.Wrap(err, "validate engine log subsystems") } + return nil +} + +// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class. +func (c *Config) UpdatePMDKEnvarsStackSizeDCPM() error { + stackSizeStr, err := c.GetEnvVar("ABT_THREAD_STACKSIZE") + if err != nil { + c.EnvVars = append(c.EnvVars, fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", + minABTThreadStackSizeDCPM)) + return nil + } + // Ensure at least 20KiB ABT stack size for an engine with DCPM storage class. + stackSizeValue, err := strconv.Atoi(stackSizeStr) + if err != nil { + return errors.Errorf("env_var ABT_THREAD_STACKSIZE has invalid value: %s", + stackSizeStr) + } + if stackSizeValue < minABTThreadStackSizeDCPM { + return errors.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+ + "for DCPM storage class, found %d", minABTThreadStackSizeDCPM, + stackSizeValue) + } + return nil +} + +// Ensure proper configuration of shutdown (SDS) state +func (c *Config) UpdatePMDKEnvarsPMemobjConf(isDCPM bool) error { + pmemobjConfStr, pmemobjConfErr := c.GetEnvVar("PMEMOBJ_CONF") + //also work for empty string + hasSdsAtCreate := strings.Contains(pmemobjConfStr, "sds.at_create") + if isDCPM { + if !hasSdsAtCreate { + return nil + } + // Confirm default handling of shutdown state (SDS) for DCPM storage class. + return errors.New("env_var PMEMOBJ_CONF should NOT contain 'sds.at_create=?' " + + "for DCPM storage class, found '" + pmemobjConfStr + "'") + } + + // Disable shutdown state (SDS) (part of RAS) for RAM-based simulated SCM. + if pmemobjConfErr != nil { + c.EnvVars = append(c.EnvVars, "PMEMOBJ_CONF=sds.at_create=0") + return nil + } + if !hasSdsAtCreate { + envVars, _ := common.DeleteKeyValue(c.EnvVars, "PMEMOBJ_CONF") + c.EnvVars = append(envVars, "PMEMOBJ_CONF="+pmemobjConfStr+ + ";sds.at_create=0") + return nil + } + if strings.Contains(pmemobjConfStr, "sds.at_create=1") { + return errors.New("env_var PMEMOBJ_CONF should contain 'sds.at_create=0' " + + "for non-DCPM storage class, found '" + pmemobjConfStr + "'") + } + return nil +} + +// Ensure proper environment variables for PMDK w/ NDCTL enabled based on +// the actual configuration of the storage class. +func (c *Config) UpdatePMDKEnvars() error { + + if len(c.Storage.Tiers) == 0 { + return errors.New("Invalid config - no tier 0 defined") + } + + isDCPM := c.Storage.Tiers[0].Class == storage.ClassDcpm + if err := c.UpdatePMDKEnvarsPMemobjConf(isDCPM); err != nil { + return err + } + + if isDCPM { + return c.UpdatePMDKEnvarsStackSizeDCPM() + } return nil } @@ -690,3 +766,13 @@ func (c *Config) WithStorageIndex(i uint32) *Config { c.Storage.EngineIdx = uint(i) return c } + +// WithEnvVarAbtThreadStackSize sets environment variable ABT_THREAD_STACKSIZE. +func (c *Config) WithEnvVarAbtThreadStackSize(stack_size uint16) *Config { + return c.WithEnvVars(fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", stack_size)) +} + +// WithEnvVarPMemObjSdsAtCreate sets PMEMOBJ_CONF env. var. to sds.at_create=0/1 value +func (c *Config) WithEnvVarPMemObjSdsAtCreate(value uint8) *Config { + return c.WithEnvVars(fmt.Sprintf("PMEMOBJ_CONF=sds.at_create=%d", value)) +} diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go index 463e86567dcd..d9ca2bfebbd2 100644 --- a/src/control/server/engine/config_test.go +++ b/src/control/server/engine/config_test.go @@ -1104,3 +1104,210 @@ func TestFabricConfig_Update(t *testing.T) { }) } } + +func TestConfig_UpdatePMDKEnvarsStackSizeDCPM(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + expABTthreadStackSize int + }{ + "empty config should not fail": { + cfg: MockConfig(), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "valid config for DCPM should not fail": { + cfg: validConfig().WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "config for DCPM without thread size should not fail": { + cfg: validConfig(), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "config for DCPM with stack size big enough should not fail": { + cfg: validConfig(). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM + 1), + expABTthreadStackSize: minABTThreadStackSizeDCPM + 1, + }, + "config for DCPM with stack size too small should fail": { + cfg: validConfig(). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expErr: errors.New(fmt.Sprintf("env_var ABT_THREAD_STACKSIZE "+ + "should be >= %d for DCPM storage class, found %d", + minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1)), + }, + "config for DCPM with invalid ABT_THREAD_STACKSIZE value should fail": { + cfg: validConfig().WithEnvVars("ABT_THREAD_STACKSIZE=foo_bar"), + expErr: errors.New("env_var ABT_THREAD_STACKSIZE has invalid value: foo_bar"), + }, + } { + t.Run(name, func(t *testing.T) { + err := tc.cfg.UpdatePMDKEnvarsStackSizeDCPM() + test.CmpErr(t, tc.expErr, err) + if err == nil { + stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE") + test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE") + stackSizeVal, err := strconv.Atoi(stackSizeStr) + test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE") + test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal, + "Invalid ABT_THREAD_STACKSIZE value") + } + }) + } +} + +func TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig().WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + }{ + "empty config should not fail": { + cfg: MockConfig(), + }, + "valid config for DCPM should not fail": { + cfg: validConfig(), + }, + "config for DCPM with forced sds.at_create (1) should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " + + "'sds.at_create=?' for DCPM storage class, found 'sds.at_create=1'"), + }, + "config for DCPM with forced sds.at_create (0) should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0), + expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " + + "'sds.at_create=?' for DCPM storage class, found 'sds.at_create=0'"), + }, + } { + t.Run(name, func(t *testing.T) { + test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(true)) + }) + } +} + +func TestConfig_UpdatePMDKEnvarsPMemobjConfNRam(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + expPMEMOBJ_CONF string + }{ + "empty config should not fail": { + cfg: validConfig(), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "config for ram without PMEMOBJ_CONF should not fail": { + cfg: MockConfig(), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "valid config for should not fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "config for ram w/ PMEMOBJ_CONF w/o sds.at_create should should be updated": { + cfg: validConfig().WithEnvVars("PMEMOBJ_CONF=foo_bar"), + expPMEMOBJ_CONF: "foo_bar;sds.at_create=0", + }, + "config for ram with sds.at_create set to 1 should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class" + + ", found 'sds.at_create=1'"), + }, + "config for ram w/ PMEMOBJ_CONF w/ sds.at_create=1 should fail": { + cfg: validConfig(). + WithEnvVars("PMEMOBJ_CONF=sds.at_create=1;foo-bar"), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class" + + ", found 'sds.at_create=1;foo-bar'"), + }, + } { + t.Run(name, func(t *testing.T) { + test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(false)) + if len(tc.expPMEMOBJ_CONF) > 0 { + sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF") + test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF") + test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create, + "Invalid PMEMOBJ_CONF") + } + + }) + } +} + +func TestConfig_UpdatePMDKEnvars(t *testing.T) { + validConfig := func(storageclas string) *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass(storageclas)) + } + for name, tc := range map[string]struct { + cfg *Config + expErr error + expPMEMOBJ_CONF string + expABTthreadStackSize int + }{ + "empty config should fail": { + cfg: MockConfig(), + expErr: errors.New("Invalid config - no tier 0 defined"), + expABTthreadStackSize: -1, + }, + "valid config for RAM should not fail": { + cfg: validConfig("ram"). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expPMEMOBJ_CONF: "sds.at_create=0", + expABTthreadStackSize: minABTThreadStackSizeDCPM - 1, + }, + "invalid config for RAM should fail": { + cfg: validConfig("ram").WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class, " + + "found 'sds.at_create=1'"), + expABTthreadStackSize: -1, + }, + "valid config for DCPM should not fail": { + cfg: validConfig("dcpm"), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "invalid config for DCPM should not fail": { + cfg: validConfig("dcpm"). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expErr: errors.New("env_var ABT_THREAD_STACKSIZE should be >= 20480 " + + "for DCPM storage class, found 20479"), + expABTthreadStackSize: minABTThreadStackSizeDCPM - 1, + }, + } { + t.Run(name, func(t *testing.T) { + errTc := tc.cfg.UpdatePMDKEnvars() + test.CmpErr(t, tc.expErr, errTc) + if len(tc.expPMEMOBJ_CONF) > 0 { + sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF") + test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF") + test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create, + "Invalid PMEMOBJ_CONF") + } + if tc.expABTthreadStackSize >= 0 { + stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE") + test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE") + stackSizeVal, err := strconv.Atoi(stackSizeStr) + test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE") + test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal, + "Invalid ABT_THREAD_STACKSIZE value") + } + }) + } +} diff --git a/src/control/server/server.go b/src/control/server/server.go index 4ac8ce5e04be..c9232dd1725c 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -103,6 +103,12 @@ func processConfig(log logging.Logger, cfg *config.Server, fis *hardware.FabricI return err } + for _, ec := range cfg.Engines { + if err := ec.UpdatePMDKEnvars(); err != nil { + return err + } + } + return nil } diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index b03215d08aac..e1d6ee79bce8 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1180,7 +1180,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) /* A zero size accommodates the existing file */ vpa.vpa_scm_size = 0; vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr; - rc = dss_thread_collective(tgt_vos_create_one, &vpa, 0); + rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK); if (rc) { D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/utils/build.config b/utils/build.config index 080ebae12830..52749b3f1ce5 100644 --- a/utils/build.config +++ b/utils/build.config @@ -4,7 +4,7 @@ component=daos [commit_versions] argobots=v1.1 fuse=fuse-3.16.2 -pmdk=2.0.0 +pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 @@ -30,3 +30,4 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch +pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc index 09022a74e59b..bbf0f9e2b925 100644 --- a/utils/rpms/daos.rpmlintrc +++ b/utils/rpms/daos.rpmlintrc @@ -47,10 +47,10 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a" addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so") # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected. -addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel") # a functional test builds daos from source, so it needs the various *-devel packages for daos' build dependencies. -addFilter("daos-client-tests.x86_64: E: devel-dependency capstone-devel") -addFilter("daos-client-tests.x86_64: E: explicit-lib-dependency libcapstone-devel") -addFilter("daos-client-tests.x86_64: E: devel-dependency libcapstone-devel") -addFilter("daos-client-tests.x86_64: E: devel-dependency fuse3-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency capstone-devel") +addFilter("daos-client-tests\.x86_64: E: explicit-lib-dependency lib(capstone|ndctl)-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency libcapstone-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency fuse3-devel") diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 14fa74776a22..29576bc71a1b 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.6.1 -Release: 3%{?relval}%{?dist} +Release: 4%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -49,7 +49,7 @@ BuildRequires: libabt-devel >= 1.0rc1 BuildRequires: libjson-c-devel BuildRequires: boost-devel %endif -BuildRequires: libpmemobj-devel >= 2.0.0 +BuildRequires: libpmemobj-devel >= 2.1.0 %if (0%{?rhel} >= 8) BuildRequires: fuse3-devel >= 3 %else @@ -147,11 +147,11 @@ Requires: ndctl # needed to set PMem configuration goals in BIOS through control-plane %if (0%{?suse_version} >= 1500) Requires: ipmctl >= 03.00.00.0423 -Requires: libpmemobj1 >= 2.0.0-1.suse1500 +Requires: libpmemobj1 >= 2.1.0-1.suse1500 Requires: libfabric1 >= %{libfabric_version} %else Requires: ipmctl >= 03.00.00.0468 -Requires: libpmemobj >= 2.0.0-1%{?dist} +Requires: libpmemobj >= 2.1.0-1%{?dist} %endif Requires: libfabric >= %{libfabric_version} Requires: mercury >= %{mercury_version} @@ -232,6 +232,13 @@ Requires: fuse3-devel >= 3 Requires: fuse3-devel >= 3.4.2 %endif Requires: pciutils-devel +%if (0%{?suse_version} > 0) +Requires: libndctl-devel +%endif +%if (0%{?rhel} >= 8) +Requires: ndctl-devel +Requires: daxctl-devel +%endif %description client-tests This is the package needed to run the DAOS test suite (client tests) @@ -591,6 +598,20 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog + +* Wed Oct 02 2024 Tomasz Gromadzki 2.6.1-4 +- Add support of the PMDK package 2.1.0 with NDCTL enabled. + * Increase the default ULT stack size to 20KiB if the engine uses + the DCPM storage class. + * Prevent using the RAM storage class (simulated PMem) when + the shutdown state (SDS) is active. + * Automatically disable SDS for the RAM storage class on engine startup. + * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' + environment variable to deactivate SDS for the DAOS tools + (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. + Otherwise, a user is supposed to be stopped by an error + like: "Unsafe shutdown count is not supported for this source". + * Tue Oct 01 2024 Phillip Henderson 2.6.1-3 - Third release candidate for 2.6.1 diff --git a/utils/run_utest.py b/utils/run_utest.py index c2f4ffd002dd..1835f230e362 100755 --- a/utils/run_utest.py +++ b/utils/run_utest.py @@ -440,6 +440,7 @@ def run(self, base, memcheck, sudo): cmd = new_cmd self.last = cmd + self.env.update({"PMEMOBJ_CONF": "sds.at_create=0"}) if self.suite.gha: retval = run_cmd(cmd, env=self.env) else: diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh index 81a044bfddb0..472f88c99255 100755 --- a/utils/scripts/install-el8.sh +++ b/utils/scripts/install-el8.sh @@ -20,6 +20,7 @@ dnf --nodocs install \ clang-tools-extra \ cmake \ CUnit-devel \ + daxctl-devel \ diffutils \ e2fsprogs \ file \ @@ -48,6 +49,7 @@ dnf --nodocs install \ lz4-devel \ make \ ndctl \ + ndctl-devel \ numactl \ numactl-devel \ openmpi-devel \ diff --git a/utils/scripts/install-el9.sh b/utils/scripts/install-el9.sh index 9ddd8c257d60..092de8eba0f0 100755 --- a/utils/scripts/install-el9.sh +++ b/utils/scripts/install-el9.sh @@ -18,6 +18,7 @@ dnf --nodocs install \ clang-tools-extra \ cmake \ CUnit-devel \ + daxctl-devel \ diffutils \ e2fsprogs \ file \ @@ -47,6 +48,7 @@ dnf --nodocs install \ lz4-devel \ make \ ndctl \ + ndctl-devel \ numactl \ numactl-devel \ openmpi-devel \ diff --git a/utils/scripts/install-leap15.sh b/utils/scripts/install-leap15.sh index fc9826ce508d..0eb8ef44fee4 100755 --- a/utils/scripts/install-leap15.sh +++ b/utils/scripts/install-leap15.sh @@ -38,6 +38,7 @@ dnf --nodocs install \ libjson-c-devel \ libltdl7 \ liblz4-devel \ + libndctl-devel \ libnuma-devel \ libopenssl-devel \ libprotobuf-c-devel \ diff --git a/utils/scripts/install-ubuntu.sh b/utils/scripts/install-ubuntu.sh index ec21c92f474c..a36641d5f101 100755 --- a/utils/scripts/install-ubuntu.sh +++ b/utils/scripts/install-ubuntu.sh @@ -29,11 +29,13 @@ apt-get install \ libcapstone-dev \ libcmocka-dev \ libcunit1-dev \ + libdaxctl-dev \ libfuse3-dev \ libhwloc-dev \ libibverbs-dev \ libjson-c-dev \ liblz4-dev \ + libndctl-dev \ libnuma-dev \ libopenmpi-dev \ libpci-dev \ From 60d4b5dc682463eb61f829b415cc31ea0b993ebc Mon Sep 17 00:00:00 2001 From: Li Wei Date: Fri, 18 Oct 2024 17:39:03 +0900 Subject: [PATCH 05/26] DAOS-16653 pool: Batch crt events (#15230) (#15302) * DAOS-16653 pool: Batch crt events When multiple engines become unavailable around the same time, if a pool cannot tolerate the unavailability of those engines, it is sometimes desired that the pool would not exclude any of the engines. Hence, this patch introduces a CaRT event delay, tunable via the server-side environment variable, CRT_EVENT_DELAY, so that the events signaling the unavailability of those engines will be handled in hopefully one batch, giving pool_svc_update_map_internal a chance to reject the pool map update based on the RF check. When the RF check rejects a pool map change, we should revisit the corresponding events later, rather than simply throwing them away. This patch improves this case by returning the events back to the event queue, and pause the queue handling until next new event or pool map update. - Introduce event sets: pool_svc_event_set. Now the event queue can be simplified to just one event set. - Add the ability to pause and resume the event handling: pse_paused. - Track the time when the latest event was queued: pse_time. Signed-off-by: Li Wei --- docs/admin/env_variables.md | 1 + src/pool/srv_pool.c | 454 +++++++++++++++----- src/tests/ftest/util/server_utils_params.py | 1 + 3 files changed, 346 insertions(+), 110 deletions(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 060c3790d579..31e0f3e87672 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 667e4bc6ed69..2d80088d4b6b 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version) /* Pool service crt event */ struct pool_svc_event { - d_list_t psv_link; d_rank_t psv_rank; uint64_t psv_incarnation; enum crt_event_source psv_src; @@ -72,15 +71,40 @@ struct pool_svc_event { #define DF_PS_EVENT "rank=%u inc="DF_U64" src=%d type=%d" #define DP_PS_EVENT(e) e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type -#define RECHOOSE_SLEEP_MS 250 +/* + * Pool service crt event set + * + * This stores an unordered array of pool_svc_event objects. For all different + * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank. + * + * An event set facilitates the merging of a sequence of events. For instance, + * sequence (in the format ) + * <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D> + * will merge into set + * <3, D>, <5, A>, <1, D> + * (that is, during the merge, an event overrides a previuos event of the same + * rank in the set). + */ +struct pool_svc_event_set { + struct pool_svc_event *pss_buf; + uint32_t pss_len; + uint32_t pss_cap; +}; + +#define DF_PS_EVENT_SET "len=%u" +#define DP_PS_EVENT_SET(s) s->pss_len /* Pool service crt-event-handling state */ struct pool_svc_events { - ABT_mutex pse_mutex; - ABT_cond pse_cv; - d_list_t pse_queue; - ABT_thread pse_handler; - bool pse_stop; + ABT_mutex pse_mutex; + ABT_cond pse_cv; + struct pool_svc_event_set *pse_pending; + uint64_t pse_timeout; /* s */ + uint64_t pse_time; /* s */ + struct sched_request *pse_timer; + ABT_thread pse_handler; + bool pse_stop; + bool pse_paused; }; /* Pool service schedule state */ @@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path) return 0; } +static unsigned int +get_crt_event_delay(void) +{ + unsigned int t = 10 /* s */; + + d_getenv_uint("CRT_EVENT_DELAY", &t); + return t; +} + static int pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) { @@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t)); uuid_copy(svc->ps_uuid, id->iov_buf); - D_INIT_LIST_HEAD(&svc->ps_events.pse_queue); + svc->ps_events.pse_timeout = get_crt_event_delay(); svc->ps_events.pse_handler = ABT_THREAD_NULL; svc->ps_svc_rf = -1; svc->ps_force_notify = false; @@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void) pool_disable_exclude = false; } +static int +alloc_event_set(struct pool_svc_event_set **event_set) +{ + D_ALLOC_PTR(*event_set); + if (*event_set == NULL) + return -DER_NOMEM; + return 0; +} + +static void +free_event_set(struct pool_svc_event_set **event_set) +{ + D_FREE((*event_set)->pss_buf); + D_FREE(*event_set); +} + +static int +add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation, + enum crt_event_source src, enum crt_event_type type) +{ + int i; + + /* Find rank in event_set. */ + for (i = 0; i < event_set->pss_len; i++) + if (event_set->pss_buf[i].psv_rank == rank) + break; + + /* If not found, prepare to add a new event. */ + if (i == event_set->pss_len) { + if (event_set->pss_len == event_set->pss_cap) { + uint32_t cap; + struct pool_svc_event *buf; + + if (event_set->pss_cap == 0) + cap = 1; + else + cap = 2 * event_set->pss_cap; + D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap); + if (buf == NULL) + return -DER_NOMEM; + event_set->pss_buf = buf; + event_set->pss_cap = cap; + } + event_set->pss_len++; + } + + event_set->pss_buf[i].psv_rank = rank; + event_set->pss_buf[i].psv_incarnation = incarnation; + event_set->pss_buf[i].psv_src = src; + event_set->pss_buf[i].psv_type = type; + return 0; +} + +/* Merge next into prev. */ +static int +merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next) +{ + int i; + + for (i = 0; i < next->pss_len; i++) { + struct pool_svc_event *event = &next->pss_buf[i]; + int rc; + + rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src, + event->psv_type); + if (rc != 0) + return rc; + } + return 0; +} + static int queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src, enum crt_event_type type) { struct pool_svc_events *events = &svc->ps_events; - struct pool_svc_event *event; + int rc; + bool allocated = false; - D_ALLOC_PTR(event); - if (event == NULL) - return -DER_NOMEM; + D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank, + incarnation, src, type); - event->psv_rank = rank; - event->psv_incarnation = incarnation; - event->psv_src = src; - event->psv_type = type; + ABT_mutex_lock(events->pse_mutex); - D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + if (events->pse_pending == NULL) { + rc = alloc_event_set(&events->pse_pending); + if (rc != 0) + goto out; + allocated = true; + } + + rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type); + if (rc != 0) + goto out; + + events->pse_time = daos_gettime_coarse(); + + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + } - ABT_mutex_lock(events->pse_mutex); - d_list_add_tail(&event->psv_link, &events->pse_queue); ABT_cond_broadcast(events->pse_cv); + +out: + if (rc != 0 && allocated) + free_event_set(&events->pse_pending); ABT_mutex_unlock(events->pse_mutex); - return 0; + return rc; } static void -discard_events(d_list_t *queue) +resume_event_handling(struct pool_svc *svc) { - struct pool_svc_event *event; - struct pool_svc_event *tmp; + struct pool_svc_events *events = &svc->ps_events; - d_list_for_each_entry_safe(event, tmp, queue, psv_link) { - D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event)); - d_list_del_init(&event->psv_link); - D_FREE(event); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + ABT_cond_broadcast(events->pse_cv); } + ABT_mutex_unlock(events->pse_mutex); } -static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank); +static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); -static void -handle_event(struct pool_svc *svc, struct pool_svc_event *event) +static int +handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) { + int i; int rc; - if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) || - (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) { - D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event), - pool_disable_exclude); - goto out; + D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid), + DP_PS_EVENT_SET(event_set)); + + if (!pool_disable_exclude) { + rc = pool_svc_exclude_ranks(svc, event_set); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid)); + return rc; + } } - if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && - event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MD, "ignore exclusion of self\n"); - goto out; + /* + * Check if the alive ranks are up in the pool map. If in the future we + * add automatic reintegration below, for instance, we may need + * to not only take svc->ps_lock, but also employ an RDB TX by + * the book. + */ + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + continue; + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { + /* + * The rank is up in the pool map. Request a pool map + * distribution just in case the rank has recently + * restarted and does not have a copy of the pool map. + */ + ds_rsvc_request_map_dist(&svc->ps_rsvc); + D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); + break; + } } + ABT_rwlock_unlock(svc->ps_pool->sp_lock); - D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + return 0; +} - if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { - /* - * Check if the rank is up in the pool map. If in the future we - * add automatic reintegration below, for instance, we may need - * to not only take svc->ps_lock, but also employ an RDB TX by - * the book. - */ - ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank); - ABT_rwlock_unlock(svc->ps_pool->sp_lock); - if (!rc) - goto out; +struct event_timer_arg { + struct pool_svc_events *eta_events; + uint64_t eta_deadline; +}; - /* - * The rank is up in the pool map. Request a pool map - * distribution just in case the rank has recently restarted - * and does not have a copy of the pool map. - */ - ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", - DP_UUID(svc->ps_uuid), event->psv_rank); - } else if (event->psv_type == CRT_EVT_DEAD) { - rc = pool_svc_exclude_rank(svc, event->psv_rank); - if (rc != 0) - D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n", - DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc)); - } +static void +event_timer(void *varg) +{ + struct event_timer_arg *arg = varg; + struct pool_svc_events *events = arg->eta_events; + int64_t time_left = arg->eta_deadline - daos_gettime_coarse(); -out: - return; + if (time_left > 0) + sched_req_sleep(events->pse_timer, time_left * 1000); + ABT_cond_broadcast(events->pse_cv); +} + +static int +start_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + uuid_t uuid; + struct sched_req_attr attr; + + D_ASSERT(events->pse_timer == NULL); + uuid_clear(uuid); + sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid); + events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0); + if (events->pse_timer == NULL) + return -DER_NOMEM; + return 0; +} + +static void +stop_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + + D_ASSERT(events->pse_timer != NULL); + sched_req_wait(events->pse_timer, true /* abort */); + sched_req_put(events->pse_timer); + events->pse_timer = NULL; } static void @@ -1403,31 +1559,83 @@ events_handler(void *arg) D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid)); for (;;) { - struct pool_svc_event *event = NULL; - bool stop; + struct pool_svc_event_set *event_set = NULL; + bool stop; + int rc; ABT_mutex_lock(events->pse_mutex); for (;;) { + struct event_timer_arg timer_arg; + int64_t time_left; + stop = events->pse_stop; if (stop) { - discard_events(&events->pse_queue); + events->pse_paused = false; + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); break; } - if (!d_list_empty(&events->pse_queue)) { - event = d_list_entry(events->pse_queue.next, struct pool_svc_event, - psv_link); - d_list_del_init(&event->psv_link); + + timer_arg.eta_events = events; + timer_arg.eta_deadline = events->pse_time + events->pse_timeout; + + time_left = timer_arg.eta_deadline - daos_gettime_coarse(); + if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) { + event_set = events->pse_pending; + events->pse_pending = NULL; break; } + + /* A simple timed cond_wait without polling. */ + if (time_left > 0) { + rc = start_event_timer(&timer_arg); + if (rc != 0) { + /* No delay then. */ + DL_ERROR(rc, DF_UUID ": failed to start event timer", + DP_UUID(svc->ps_uuid)); + events->pse_time = 0; + continue; + } + } sched_cond_wait(events->pse_cv, events->pse_mutex); + if (time_left > 0) + stop_event_timer(&timer_arg); } ABT_mutex_unlock(events->pse_mutex); if (stop) break; - handle_event(svc, event); + rc = handle_event(svc, event_set); + if (rc != 0) { + /* Put event_set back to events->pse_pending. */ + D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid)); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_pending == NULL) { + /* + * No pending events; pause the handling until + * next event or pool map change. + */ + D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n", + DP_UUID(svc->ps_uuid)); + events->pse_paused = true; + } else { + /* + * There are pending events; do not pause the + * handling. + */ + rc = merge_event_sets(event_set, events->pse_pending); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to merge events", + DP_UUID(svc->ps_uuid)); + free_event_set(&events->pse_pending); + } + events->pse_pending = event_set; + event_set = NULL; + ABT_mutex_unlock(events->pse_mutex); + } - D_FREE(event); + if (event_set != NULL) + free_event_set(&event_set); ABT_thread_yield(); } @@ -1437,7 +1645,7 @@ events_handler(void *arg) static bool events_pending(struct pool_svc *svc) { - return !d_list_empty(&svc->ps_events.pse_queue); + return svc->ps_events.pse_pending != NULL; } static void @@ -1461,9 +1669,11 @@ init_events(struct pool_svc *svc) struct pool_svc_events *events = &svc->ps_events; int rc; - D_ASSERT(d_list_empty(&events->pse_queue)); + D_ASSERT(events->pse_pending == NULL); + D_ASSERT(events->pse_timer == NULL); D_ASSERT(events->pse_handler == ABT_THREAD_NULL); - D_ASSERT(events->pse_stop == false); + D_ASSERT(!events->pse_stop); + D_ASSERT(!events->pse_paused); if (!ds_pool_skip_for_check(svc->ps_pool)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); @@ -1498,7 +1708,8 @@ init_events(struct pool_svc *svc) err_cb: if (!ds_pool_skip_for_check(svc->ps_pool)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); - discard_events(&events->pse_queue); + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); err: return rc; } @@ -1507,7 +1718,6 @@ static void fini_events(struct pool_svc *svc) { struct pool_svc_events *events = &svc->ps_events; - int rc; D_ASSERT(events->pse_handler != ABT_THREAD_NULL); @@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc) ABT_cond_broadcast(events->pse_cv); ABT_mutex_unlock(events->pse_mutex); - rc = ABT_thread_join(events->pse_handler); - D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); ABT_thread_free(&events->pse_handler); events->pse_handler = ABT_THREAD_NULL; events->pse_stop = false; @@ -4332,7 +4540,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks, list_cont_bulk_destroy(bulk); D_FREE(resp_cont); crt_req_decref(rpc); - dss_sleep(RECHOOSE_SLEEP_MS); + dss_sleep(250); D_GOTO(rechoose, rc); } @@ -6501,7 +6709,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma } static int -pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map) { crt_group_t *primary_grp; struct pool_domain *doms; @@ -6517,13 +6725,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) D_CRIT("!!! Please try to recover these engines in top priority -\n"); D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); - D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n", - DP_UUID(svc->ps_uuid), rank); for (i = 0; i < doms_cnt; i++) { struct swim_member_state state; - if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || - (doms[i].do_comp.co_rank == rank)) + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN)) continue; rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); @@ -6701,8 +6906,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * with CRIT log message to ask administrator to bring back the engine. */ if (src == MUS_SWIM && opc == MAP_EXCLUDE) { - d_rank_t rank; - int failed_cnt; + int failed_cnt; rc = pool_map_update_failed_cnt(map); if (rc != 0) { @@ -6711,15 +6915,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } - D_ASSERT(tgt_addrs->pta_number == 1); - rank = tgt_addrs->pta_addrs->pta_rank; failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); - D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", - DP_UUID(svc->ps_uuid), rank, failed_cnt); + D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt); if (failed_cnt > pw_rf) { - D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", - DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); - rc = pool_map_crit_prompt(svc, map, rank); + D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt); + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map); + ABT_rwlock_unlock(svc->ps_pool->sp_lock); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to log prompt", + DP_UUID(svc->ps_uuid)); + rc = -DER_RF; goto out_map; } } @@ -6768,6 +6976,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, ds_rsvc_request_map_dist(&svc->ps_rsvc); + /* See events_handler. */ + resume_event_handling(svc); + rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */); if (rc != 0) { DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration", @@ -7167,28 +7378,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc) } static int -pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) +pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set) { struct pool_target_addr_list list; struct pool_target_addr_list inval_list_out = { 0 }; - struct pool_target_addr tgt_rank; + struct pool_target_addr *addrs; + d_rank_t self_rank = dss_self_rank(); uint32_t map_version = 0; + int n = 0; + int i; int rc; - tgt_rank.pta_rank = rank; - tgt_rank.pta_target = -1; - list.pta_number = 1; - list.pta_addrs = &tgt_rank; + D_ALLOC_ARRAY(addrs, event_set->pss_len); + if (addrs == NULL) + return -DER_NOMEM; + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_type != CRT_EVT_DEAD) + continue; + if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) { + D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n", + DP_UUID(svc->ps_uuid)); + continue; + } + addrs[n].pta_rank = event->psv_rank; + addrs[n].pta_target = -1; + n++; + } + if (n == 0) { + rc = 0; + goto out; + } + list.pta_number = n; + list.pta_addrs = addrs; rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, NULL /* hint */, MUS_SWIM); - D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", - DP_UUID(svc->ps_uuid), map_version, rank, rc); + D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n", + DP_UUID(svc->ps_uuid), map_version, n, rc); pool_target_addr_list_free(&inval_list_out); - +out: + D_FREE(addrs); return rc; } diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 440ffe68f823..7cda958d2423 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters): "common": [ "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", + "CRT_EVENT_DELAY=1", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], From e0f58831ef4b7de01bc0abb4be85d0635a85f3cd Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Fri, 18 Oct 2024 06:48:22 -0700 Subject: [PATCH 06/26] DAOS-16720 cq: pin isort to v1.1.0 (#15338) (#15339) Pin isort to v1.1.0 to avoid suprprise changes and because v1.1.1 is not working for us. Signed-off-by: Dalton Bohning --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fce435168a3..67c57bd31545 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3' - - uses: isort/isort-action@master + - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5 # v1.1.0 with: requirementsFiles: "requirements.txt" - name: Run on SConstruct file. From cb9d278be5b6730f294d65b74db17e4a948a9a23 Mon Sep 17 00:00:00 2001 From: Ken Cain Date: Sat, 19 Oct 2024 02:53:37 -0400 Subject: [PATCH 07/26] DAOS-15852 test: more timing samples for co_op_dup_timing() (#14497) (#15324) Rarely, this test will produce timings that exceed the failure threshold. Local and PR/CI experiments have shown that increasing the test's NUM_OPS to more than 200 iterations greatly reduces or may eliminate such intermittent timing failures, by "spreading out" the magnitude of the time spent in the 3 main loops of the test (0% loops perform fault injections, 33%, and 50%). Signed-off-by: Kenneth Cain --- src/tests/suite/daos_container.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c index 958f9fd1d165..243a829c3176 100644 --- a/src/tests/suite/daos_container.c +++ b/src/tests/suite/daos_container.c @@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state) size_t const in_sizes[] = {strlen(in_values[0]), strlen(in_values[1])}; int n = (int)ARRAY_SIZE(names); const unsigned int NUM_FP = 3; - const uint32_t NUM_OPS = 200; + const uint32_t NUM_OPS = 500; uint32_t num_failures = 0; const uint32_t SVC_OPS_ENABLED = 1; const uint32_t SVC_OPS_ENTRY_AGE = 60; From f8682fb56bc0d7928eb807f45599a5d9e9ada5de Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Sun, 20 Oct 2024 21:55:19 +0800 Subject: [PATCH 08/26] DAOS-16572 rebuild: properly assign global_dtx_resync_version in IV - b26 (#15186) In rebuild_iv_ent_refresh() for refreshing DTX resync version, needs to assign rt_global_dtx_resync_version firstly before wakeup related rebuild_scan_leader. Signed-off-by: Fan Yong --- src/rebuild/rebuild_iv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c index cc585e037e13..f4b9f75f4078 100644 --- a/src/rebuild/rebuild_iv.c +++ b/src/rebuild/rebuild_iv.c @@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, struct rebuild_tgt_pool_tracker *rpt; struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf; struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf; + uint32_t old_ver; int rc = 0; rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver, @@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, dst_iv->riv_stable_epoch); rpt->rt_global_done = dst_iv->riv_global_done; rpt->rt_global_scan_done = dst_iv->riv_global_scan_done; - if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver && + old_ver = rpt->rt_global_dtx_resync_version; + if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version) + rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; + if (old_ver < rpt->rt_rebuild_ver && dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) { D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n", - DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version, + DP_UUID(src_iv->riv_pool_uuid), old_ver, dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); ABT_mutex_unlock(rpt->rt_lock); } - rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; } out: From 81e57d0ec818c1f117afa97418d1f126bc528952 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Mon, 21 Oct 2024 10:10:03 -0600 Subject: [PATCH 09/26] DAOS-16716 ci: Set reference build for PRs (#15337) Release branch PRs should use the release branch build instead of master branch build for NLT reference Signed-off-by: Jeff Olivier --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 63c2dc28ae13..13ccc5128950 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -801,6 +801,7 @@ pipeline { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', always_script: 'ci/unit/test_nlt_post.sh', + referenceJobName: 'daos-stack/daos/release%252F2.6', valgrind_stash: 'el8-gcc-nlt-memcheck' recordIssues enabledForFailure: true, failOnError: false, From c8213799c4c14ad23ab6ef87c386b5e1d16ec089 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Tue, 22 Oct 2024 02:14:44 +0800 Subject: [PATCH 10/26] DAOS-16329 chk: maintenance mode after checking pool with dryrun - b26 (#14985) Sometimes, after system shutdown unexpectedly, the users may expect to check their critical data under some kind of maintenance mode. Under such mode, no user data can be modified or moved or aggregated. That will guarantee no further potential (DAOS logic caused) damage can happen during the check. For such purpose, we will enhance current DAOS CR logic with --dryrun option to allow the pool (after check) to be opened as immutable with disabling some mechanism that may potentially cause data modification or movement (such as rebuild or aggregation). Under such mode, if client wants to connect to the pool, the read-only option must be specified. Similarly for opening container in such pool. Signed-off-by: Fan Yong --- src/chk/chk_common.c | 2 +- src/chk/chk_engine.c | 13 +-- src/chk/chk_internal.h | 1 + src/client/dfs/cont.c | 4 +- src/container/srv_container.c | 32 +++++-- src/container/srv_target.c | 10 ++- src/include/daos/container.h | 4 +- src/include/daos_cont.h | 3 + src/include/daos_srv/pool.h | 7 +- src/include/daos_srv/security.h | 15 +++- src/mgmt/srv_target.c | 2 +- src/pool/srv_internal.h | 6 ++ src/pool/srv_pool.c | 82 +++++++++++++----- src/pool/srv_target.c | 16 ++-- src/rebuild/srv.c | 2 +- src/security/srv_acl.c | 8 +- src/tests/suite/daos_cr.c | 144 ++++++++++++++++++++++++++++++++ 17 files changed, 297 insertions(+), 54 deletions(-) diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index ace1c736791d..39821ec63287 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr) if (cpr->cpr_started) chk_pool_shutdown(cpr, true); - rc = ds_pool_start_after_check(cpr->cpr_uuid); + rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable); if (rc != 0) { D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n", DP_UUID(cpr->cpr_uuid), DP_RC(rc)); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 56e6da3ad9b9..ad61af851ce5 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args) } rc = chk_engine_cont_cleanup(cpr, svc, &aggregator); - if (rc != 0) - goto out; - - rc = ds_pool_svc_schedule_reconf(svc); + if (rc == 0 && !cpr->cpr_immutable) + rc = ds_pool_svc_schedule_reconf(svc); out: chk_engine_cont_list_fini(&aggregator); @@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins) if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE) continue; + if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN) + cpr->cpr_immutable = 1; + else + cpr->cpr_immutable = 0; + if (phase > pool_cbk->cb_phase) phase = pool_cbk->cb_phase; @@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) cbk = &cpr->cpr_bk; chk_pool_get(cpr); - rc = ds_pool_start(uuid, false); + rc = ds_pool_start(uuid, false, cpr->cpr_immutable); if (rc != 0) D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc)); diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 86868e305ee9..9ab16b060b39 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -596,6 +596,7 @@ struct chk_pool_rec { cpr_stop:1, cpr_done:1, cpr_skip:1, + cpr_immutable:1, cpr_dangling:1, cpr_for_orphan:1, cpr_notified_exit:1, diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c index 910e1819aa32..278229140564 100644 --- a/src/client/dfs/cont.c +++ b/src/client/dfs/cont.c @@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * out_snap: D_FREE(oit_args); epr.epr_hi = epr.epr_lo = snap_epoch; - rc2 = daos_cont_destroy_snap(coh, epr, NULL); + rc2 = daos_cont_destroy_snap(coh, epr, NULL); + if (rc2 != 0) + D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2)); if (rc == 0) rc = daos_der2errno(rc2); out_dfs: diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 372da43afe44..91d87d9b9781 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, * - Users who can delete any container in the pool * - Users who have been given access to delete the specific container */ - if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && - !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, - &owner, acl)) { + if (pool_hdl->sph_pool->sp_immutable || + (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && + !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) { D_ERROR(DF_CONT": permission denied to delete cont\n", DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid)); D_GOTO(out_prop, rc = -DER_NO_PERM); @@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } + if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable " + "container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc)); + goto out; + } + /* * Need props to check for pool redundancy requirements and access * control. @@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr D_GOTO(out, rc); } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags " + DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas); + if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) { D_ERROR(DF_CONT": permission denied evicting all handles\n", DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); @@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } - if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) { - D_ERROR(DF_CONT": permission denied opening exclusively\n", - DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); - daos_prop_free(prop); + if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) || + ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) { rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container " + "with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas, + DP_RC(rc)); + daos_prop_free(prop); goto out; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b0f3b693580a..a6d3d3c5286a 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -935,7 +935,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, cont_child->sc_stopping, cont_child->sc_destroying); rc = -DER_SHUTDOWN; } else if (!cont_child_started(cont_child)) { - if (!ds_pool_skip_for_check(pool_child->spc_pool)) { + if (!ds_pool_restricted(pool_child->spc_pool, false)) { rc = cont_start_agg(cont_child); if (rc != 0) goto out; @@ -1591,11 +1591,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, * but for creating rebuild global container handle. */ D_ASSERT(hdl->sch_cont != NULL); + D_ASSERT(hdl->sch_cont->sc_pool != NULL); hdl->sch_cont->sc_open++; if (hdl->sch_cont->sc_open > 1) goto opened; + if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false)) + goto csum_init; + rc = dtx_cont_open(hdl->sch_cont); if (rc != 0) { D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont " @@ -1623,10 +1627,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, D_GOTO(err_dtx, rc); } - D_ASSERT(hdl->sch_cont != NULL); - D_ASSERT(hdl->sch_cont->sc_pool != NULL); +csum_init: rc = ds_cont_csummer_init(hdl->sch_cont); - if (rc != 0) D_GOTO(err_dtx, rc); } diff --git a/src/include/daos/container.h b/src/include/daos/container.h index 282dfb8ec494..57ef06f63a24 100644 --- a/src/include/daos/container.h +++ b/src/include/daos/container.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags) f = flags; /* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. */ - m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX); + m = f & DAOS_COO_IO_BASE_MASK; if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX) return false; diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h index c3bda9837d8b..632eafe81303 100644 --- a/src/include/daos_cont.h +++ b/src/include/daos_cont.h @@ -79,6 +79,9 @@ extern "C" { /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */ #define DAOS_COO_MASK ((1U << DAOS_COO_NBITS) - 1) +/** The basic IO mode: read-only, read-write or exclusively read-write. */ +#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX) + /** Maximum length for container hints */ #define DAOS_CONT_HINT_MAX_LEN 128 diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 6cbe3873f0ad..de22d55ed5d6 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -84,6 +84,7 @@ struct ds_pool { uuid_t sp_srv_pool_hdl; uint32_t sp_stopping:1, sp_cr_checked:1, + sp_immutable:1, sp_fetch_hdls:1, sp_need_discard:1, sp_disable_rebuild:1; @@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct pool_target_id_list *lis int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, unsigned int map_version); -bool ds_pool_skip_for_check(struct ds_pool *pool); -int ds_pool_start_after_check(uuid_t uuid); -int ds_pool_start(uuid_t uuid, bool aft_chk); +bool ds_pool_restricted(struct ds_pool *pool, bool immutable); +int ds_pool_start_after_check(uuid_t uuid, bool immutable); +int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable); int ds_pool_stop(uuid_t uuid); int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets, const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains); diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h index 203b9b548ee3..28fe83852b33 100644 --- a/src/include/daos_srv/security.h +++ b/src/include/daos_srv/security.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas); bool ds_sec_cont_can_evict_all(uint64_t cont_capas); +/** + * Determine if the container can be modified based on the container security + * capabilities. + * + * \param[in] cont_capas Capability bits acquired via + * ds_sec_cont_get_capabilities + * + * \return True Access allowed + * False Access denied + */ +bool +ds_sec_cont_can_modify(uint64_t cont_capas); + /** * Get the security capabilities for a rebuild container handle created by the * DAOS server. diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index e1d6ee79bce8..7975a2115d4e 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) tc_out->tc_ranks.ca_arrays = rank; tc_out->tc_ranks.ca_count = 1; - rc = ds_pool_start(tc_in->tc_pool_uuid, false); + rc = ds_pool_start(tc_in->tc_pool_uuid, false, false); if (rc) { D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 8f864c8c11a9..612f7760fd14 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -57,6 +57,12 @@ pool_tls_get() return tls; } +static inline bool +ds_pool_skip_for_check(struct ds_pool *pool) +{ + return engine_in_check() && !pool->sp_cr_checked; +} + struct pool_iv_map { d_rank_t piv_master_rank; uint32_t piv_pool_map_ver; diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 2d80088d4b6b..6e3a01379fa7 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1675,7 +1675,7 @@ init_events(struct pool_svc *svc) D_ASSERT(!events->pse_stop); D_ASSERT(!events->pse_paused); - if (!ds_pool_skip_for_check(svc->ps_pool)) { + if (!ds_pool_restricted(svc->ps_pool, false)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); if (rc != 0) { D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n", @@ -1706,7 +1706,7 @@ init_events(struct pool_svc *svc) return 0; err_cb: - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); if (events->pse_pending != NULL) free_event_set(&events->pse_pending); @@ -1721,7 +1721,7 @@ fini_events(struct pool_svc *svc) D_ASSERT(events->pse_handler != ABT_THREAD_NULL); - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); ABT_mutex_lock(events->pse_mutex); @@ -2565,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid) return 0; } +struct pool_start_args { + bool psa_aft_chk; + bool psa_immutable; +}; + /* * Try to start the pool. Continue the iteration upon errors as other pools may * still be able to work. @@ -2572,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid) static int start_one(uuid_t uuid, void *varg) { - int rc; + struct pool_start_args *psa = varg; + bool aft_chk; + bool immutable; + int rc; + + if (psa != NULL) { + aft_chk = psa->psa_aft_chk; + immutable = psa->psa_immutable; + } else { + aft_chk = false; + immutable = false; + } - D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid)); + D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); - rc = ds_pool_start(uuid, varg != NULL ? true : false); + rc = ds_pool_start(uuid, aft_chk, immutable); if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid)); + DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); ds_pool_failed_add(uuid, rc); } @@ -2597,12 +2615,27 @@ pool_start_all(void *arg) DP_RC(rc)); } +bool +ds_pool_restricted(struct ds_pool *pool, bool immutable) +{ + if (ds_pool_skip_for_check(pool)) + return true; + + if (pool->sp_immutable && !immutable) + return true; + + return false; +} + int -ds_pool_start_after_check(uuid_t uuid) +ds_pool_start_after_check(uuid_t uuid, bool immutable) { - bool aft_chk = true; + struct pool_start_args psa; - return start_one(uuid, &aft_chk); + psa.psa_aft_chk = true; + psa.psa_immutable = immutable; + + return start_one(uuid, &psa); } /* Note that this function is currently called from the main xstream. */ @@ -3820,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) &ds_pool_prop_connectable, &value); if (rc != 0) goto out_lock; - D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable); if (!connectable) { D_ERROR(DF_UUID": being destroyed, not accepting connections\n", DP_UUID(in->pci_op.pi_uuid)); @@ -3829,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) /* * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable() - * the PS restart for full service. If some client tries to connect the pool during + * and PS restart with full service. If some client tries to connect the pool during * such internal, it will get -DER_BUSY temporarily. */ if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) { - D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid)); - D_GOTO(out_lock, rc = -DER_BUSY); + rc = -DER_BUSY; + D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), DP_RC(rc)); + goto out_lock; + } + + if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc)); + goto out_lock; } /* Check existing pool handles. */ @@ -3997,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) } } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags " + DF_X64", sec_capas " DF_X64 "\n", + DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl), + svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas); + rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver, obj_layout_ver); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) { @@ -6522,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid)); - if (ds_pool_skip_for_check(svc->ps_pool)) { + if (ds_pool_restricted(svc->ps_pool, false)) { D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid)); return -DER_OP_CANCELED; } @@ -8761,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool) ds_rsvc_put(rsvc); return rc; } - -bool -ds_pool_skip_for_check(struct ds_pool *pool) -{ - return engine_in_check() && !pool->sp_cr_checked; -} diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index cfa837e8b2a8..113fb757fd90 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate) goto done; } - if (!ds_pool_skip_for_check(child->spc_pool)) { + if (!ds_pool_restricted(child->spc_pool, false)) { rc = start_gc_ult(child); if (rc != 0) goto out_close; @@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool) * till ds_pool_stop. Only for mgmt and pool modules. */ int -ds_pool_start(uuid_t uuid, bool aft_chk) +ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable) { struct ds_pool *pool; struct daos_llink *llink; @@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk) else pool->sp_cr_checked = 0; + if (immutable) + pool->sp_immutable = 1; + else + pool->sp_immutable = 0; + rc = pool_child_add_all(pool); if (rc != 0) goto failure_pool; @@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk) } pool->sp_fetch_hdls = 1; + } + if (!ds_pool_restricted(pool, false)) { rc = ds_pool_start_ec_eph_query_ult(pool); if (rc != 0) { D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n", @@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, DP_UUID(pool->sp_uuid), map_version_before, map_version); } - if (map_updated) { + if (map_updated && !ds_pool_restricted(pool, false)) { struct dtx_scan_args *arg; int ret; - if (ds_pool_skip_for_check(pool)) - D_GOTO(out, rc = 0); - D_ALLOC_PTR(arg); if (arg == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index b1f722b8254f..5c463af9e903 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, return 0; } - if (ds_pool_skip_for_check(pool)) { + if (ds_pool_restricted(pool, false)) { D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n", DP_UUID(pool->sp_uuid)); return 0; diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c index ad3c0107e3b7..8f05216e9715 100644 --- a/src/security/srv_acl.c +++ b/src/security/srv_acl.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas) return (cont_capas & CONT_CAPA_EVICT_ALL) != 0; } +bool +ds_sec_cont_can_modify(uint64_t cont_capas) +{ + return (cont_capas & CONT_CAPAS_W_MASK) != 0; +} + uint64_t ds_sec_get_rebuild_cont_capabilities(void) { diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 3827a0eda6a7..4c981f296452 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -18,6 +18,8 @@ #include +#include "daos_iotest.h" + /* * Will enable accurate query result verification after DAOS-13520 resolved. * #define CR_ACCURATE_QUERY_RESULT 1 @@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state) cr_cleanup(arg, &pool, 1); } +#define CR_IO_SIZE 16 + +/* + * 1. Create pool and container without inconsistency. + * 2. Write something to the container. + * 3. Start checker with --dry-run option. + * 4. Query checker, it should be completed without any inconsistency reported. + * 5. Verify the pool only can be connected with read-only permission. + * 6. Verify the container only can be opened with read-only permission. + * 7. Verify the object only can be opened with read-only permission. + * 8. Verify the object cannot be written. + * 9. Read former wrote data and verify its correctness. + * 10. Switch to normal mode and cleanup. + */ +static void +cr_maintenance_mode(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = { 0 }; + struct test_cont cont = { 0 }; + daos_handle_t coh; + daos_obj_id_t oid; + struct ioreq req; + const char *dkey = "cr_dkey"; + const char *akey = "cr_akey"; + char update_buf[CR_IO_SIZE]; + char fetch_buf[CR_IO_SIZE]; + struct daos_check_info dci = { 0 }; + int rc; + + print_message("CR28: maintenance mode after dry-run check\n"); + + rc = cr_pool_create(state, &pool, true, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_cont_create(state, &pool, &cont, 0); + assert_rc_equal(rc, 0); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL); + assert_rc_equal(rc, 0); + + oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank); + arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS; + arg->async = 0; + ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg); + dts_buf_render(update_buf, CR_IO_SIZE); + + print_message("Generate some data.\n"); + + insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + daos_fail_loc_set(0); + + rc = daos_obj_close(req.oh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_pool_disconnect(pool.poh, NULL); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the pool only can be connected as RDONLY mode.\n"); + + rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL); + assert_rc_equal(rc, -DER_NO_PERM); + + rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the container only can be opened as RDONLY mode.\n"); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL); + assert_rc_equal(rc, -DER_NO_PERM); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL); + assert_rc_equal(rc, 0); + +#if 0 + /* + * NOTE: Currently, DAOS security check is server-side logic, but object open may not + * talk with server. So related permission check will not be done until read or + * write really happened. If not consider exclusively open and cache case, then + * it seems fine; otherwise, need some enhancement in future. + */ + + print_message("Verify the object only can be opened as RDONLY mode.\n"); + + rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL); + assert_rc_equal(rc, -DER_NO_PERM); +#endif + + rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the object cannot be modified.\n"); + + req.arg->expect_result = -DER_NO_PERM; + insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + daos_fail_loc_set(0); + + print_message("Verify former wrote data.\n"); + + req.arg->expect_result = 0; + lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE); + assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE); + + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_pool_disconnect(pool.poh, NULL); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", cr_start_specified, async_disable, test_case_teardown}, @@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool1, async_disable, test_case_teardown}, { "CR27: handle the pool if some engine failed to report some pool service", cr_handle_fail_pool2, async_disable, test_case_teardown}, + { "CR28: maintenance mode after dry-run check", + cr_maintenance_mode, async_disable, test_case_teardown}, }; static int From b913d3e9685fe8b31b046d197f66477460053e68 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 21 Oct 2024 14:19:00 -0400 Subject: [PATCH 11/26] DAOS-16265 test: Fix erasurecode/rebuild_fio.py out of space (#15020) (#15340) Prevent accumulating large server log files caused by temporarily enabling the DEBUG log mask while creating or destroying pools. Signed-off-by: Phil Henderson --- .../ftest/erasurecode/multiple_failure.yaml | 1 + src/tests/ftest/erasurecode/rebuild_fio.yaml | 1 + src/tests/ftest/util/apricot/apricot/test.py | 22 +++++++++++++++---- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml index 78f132474b5b..95aab5413291 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_failure.yaml @@ -25,6 +25,7 @@ server_config: storage: auto pool: size: 93% + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml index a895c3567072..a3539d865792 100644 --- a/src/tests/ftest/erasurecode/rebuild_fio.yaml +++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml @@ -39,6 +39,7 @@ pool: aggregation: threshold: 50000000 aggr_timeout: 180 + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 42e05937f37b..563ff7adecee 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs): self.setup_start_agents = True self.slurm_exclude_servers = False self.slurm_exclude_nodes = NodeSet() + self.max_test_dir_usage_check = 90 self.host_info = HostInfo() self.hostlist_servers = NodeSet() self.hostlist_clients = NodeSet() @@ -693,6 +694,11 @@ def setUp(self): self.slurm_exclude_servers = self.params.get( "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers) + # Max test directory usage percentage - when exceeded will display sizes of files in the + # test directory + self.max_test_dir_usage_check = self.params.get( + "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check) + # The server config name should be obtained from each ServerManager # object, but some tests still use this TestWithServers attribute. self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") @@ -765,12 +771,20 @@ def setUp(self): # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) + self.log.debug( + "Common test directory (%s) contents (check > %s%%):", + os.path.dirname(self.test_dir), self.max_test_dir_usage_check) all_hosts = include_local_host(self.host_info.all_hosts) test_dir_parent = os.path.dirname(self.test_dir) - result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") - if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: - run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + _details = NodeSet() + for _host, _stdout in _result.all_stdout.items(): + _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout) + _test_dir_usage_int = int(max(_test_dir_usage + ["0"])) + if _test_dir_usage_int > self.max_test_dir_usage_check: + _details.add(_host) + if _details: + run_remote(self.log, _details, f"du -sh {test_dir_parent}/*") self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: From ffa1c9d952fb62f72524abec119e7d829bce5633 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Tue, 22 Oct 2024 11:38:08 -0400 Subject: [PATCH 12/26] DAOS-16693 telemetry: Avoid race between init/read (#15306) (#15322) In rare cases, a reader may attempt to access a telemetry node after it has been added to the tree, but before it has been fully initialized. Use an atomic to prevent reads before the initialization has completed. Unlucky readers will get a -DER_AGAIN instead of crashing. Signed-off-by: Michael MacDonald --- src/control/lib/telemetry/counter.go | 20 ++++++++----- src/control/lib/telemetry/duration.go | 20 ++++++++----- src/control/lib/telemetry/gauge.go | 38 ++++++++++++++---------- src/control/lib/telemetry/snapshot.go | 20 ++++++++----- src/control/lib/telemetry/telemetry.go | 12 ++++++++ src/control/lib/telemetry/timestamp.go | 22 +++++++++----- src/gurt/telemetry.c | 41 ++++++++++++++++++++++++++ src/include/gurt/telemetry_common.h | 3 ++ 8 files changed, 129 insertions(+), 47 deletions(-) diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go index 81549a32daf5..e6e59a8d0ea0 100644 --- a/src/control/lib/telemetry/counter.go +++ b/src/control/lib/telemetry/counter.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 { } func (c *Counter) Value() uint64 { + ctrVal := BadUintVal if c.handle == nil || c.node == nil { - return BadUintVal + return ctrVal } - var val C.uint64_t - - res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) + if res == C.DER_SUCCESS { + ctrVal = uint64(val) + } + return res } + c.fetchValWithRetry(fetch) - return BadUintVal + return ctrVal } func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter { diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go index 1f32125bc90a..3cfd240bde76 100644 --- a/src/control/lib/telemetry/duration.go +++ b/src/control/lib/telemetry/duration.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType { } func (d *Duration) Value() time.Duration { + durValue := BadDuration if d.handle == nil || d.node == nil { - return BadDuration + return durValue } - var tms C.struct_timespec - - res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) - if res == C.DER_SUCCESS { - return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) + if res == C.DER_SUCCESS { + durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + } + return res } + d.fetchValWithRetry(fetch) - return BadDuration + return durValue } func (d *Duration) FloatValue() float64 { diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go index ea84ff905047..93db24ab9fc9 100644 --- a/src/control/lib/telemetry/gauge.go +++ b/src/control/lib/telemetry/gauge.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 { // Value returns the value as an unsigned integer. func (g *Gauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge { @@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 { // Value returns the gauge value as an unsigned integer. func (g *StatsGauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge { diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go index 2ffa23296c33..5b2af9f07479 100644 --- a/src/control/lib/telemetry/snapshot.go +++ b/src/control/lib/telemetry/snapshot.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType { } func (s *Snapshot) Value() time.Time { + timeVal := time.Time{} // zero val if s.handle == nil || s.node == nil { - return time.Time{} + return timeVal } - var tms C.struct_timespec - - res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + } + return res } + s.fetchValWithRetry(fetch) - return time.Time{} + return timeVal } func (s *Snapshot) FloatValue() float64 { diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go index bb0593240b66..479c41e2aab2 100644 --- a/src/control/lib/telemetry/telemetry.go +++ b/src/control/lib/telemetry/telemetry.go @@ -84,6 +84,8 @@ const ( BadDuration = time.Duration(BadIntVal) PathSep = filepath.Separator + + maxFetchRetries = 1 ) type ( @@ -304,6 +306,16 @@ func (mb *metricBase) String() string { return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})])) } +func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int { + var rc C.int + for i := 0; i < maxFetchRetries; i++ { + if rc = fetchFn(); rc == C.DER_SUCCESS { + return rc + } + } + return rc +} + func (sm *statsMetric) Min() uint64 { return uint64(sm.stats.dtm_min) } diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go index 97ef5bb1ed95..c787aed488df 100644 --- a/src/control/lib/telemetry/timestamp.go +++ b/src/control/lib/telemetry/timestamp.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType { } func (t *Timestamp) Value() time.Time { - zero := time.Time{} + timeVal := time.Time{} // zero val if t.handle == nil || t.node == nil { - return zero + return timeVal } - var clk C.time_t - res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(clk), 0) + + fetch := func() C.int { + var clk C.time_t + res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(clk), 0) + } + return res } - return zero + t.fetchValWithRetry(fetch) + + return timeVal } // FloatValue converts the timestamp to time in seconds since the UNIX epoch. diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c index 8f08192d0720..be7762184a93 100644 --- a/src/gurt/telemetry.c +++ b/src/gurt/telemetry.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, goto out; tmp->dtn_metric = NULL; tmp->dtn_sibling = NULL; + atomic_store_relaxed(&tmp->dtn_readable, false); *newnode = node; out: @@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, pthread_mutexattr_destroy(&mattr); temp->dtn_protect = true; } + + atomic_store_relaxed(&temp->dtn_readable, true); + if (node != NULL) *node = temp; @@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...) return rc; } +static bool +node_is_readable(struct d_tm_node_t *node) +{ + if (node == NULL) + return false; + + return atomic_load_relaxed(&node->dtn_readable); +} + /** * Creates histogram counters for the given node. It calculates the * extents of each bucket and creates counters at the path specified that @@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx, if (ctx == NULL || histogram == NULL || node == NULL) return -DER_INVAL; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + rc = validate_node_ptr(ctx, node, &shmem); if (rc != 0) return rc; @@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket, if (rc != 0) return rc; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + if (!has_stats(node)) return -DER_OP_NOT_PERMITTED; @@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val, if (node->dtn_type != D_TM_COUNTER) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + /* "ctx == NULL" is server side fast version to read the counter. */ if (ctx == NULL) { metric_data = node->dtn_metric; @@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val, if (node->dtn_type != D_TM_TIMESTAMP) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo, if (node->dtn_type != D_TM_MEMINFO) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_DURATION)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data == NULL) return -DER_METRIC_NOT_FOUND; @@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val, if (!is_gauge(node)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { dtm_stats = conv_ptr(shmem, metric_data->dtm_stats); @@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units, if (node->dtn_type == D_TM_DIRECTORY) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h index 4e7d3b3a02da..2068c1c0cce9 100644 --- a/src/include/gurt/telemetry_common.h +++ b/src/include/gurt/telemetry_common.h @@ -8,6 +8,8 @@ #include +#include + #define D_TM_VERSION 1 #define D_TM_MAX_NAME_LEN 256 #define D_TM_MAX_DESC_LEN 128 @@ -236,6 +238,7 @@ struct d_tm_node_t { pthread_mutex_t dtn_lock; /** individual mutex */ struct d_tm_metric_t *dtn_metric; /** values */ bool dtn_protect; /** synchronized access */ + _Atomic bool dtn_readable; /** fully initialized and ready for reads */ }; struct d_tm_nodeList_t { From 42a0d35d815505ba0435e192933160b52e55c7c2 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Tue, 22 Oct 2024 09:10:51 -0700 Subject: [PATCH 13/26] DAOS-16696 cart: Fix rc in error path (#15313) (#15357) - Fix rc in error path during ivo_on_update failure Required-githooks: true Signed-off-by: Alexander A Oganezov --- src/cart/crt_iv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index 603e565ac1f1..bf8124a8a6a3 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info) return rc; send_error: - rc = crt_bulk_free(cb_info->buc_bulk_hdl); + /* send back whatever error got us here */ output->rc = rc; + rc = crt_bulk_free(cb_info->buc_bulk_hdl); + if (rc != 0) + DL_ERROR(rc, "crt_bulk_free() failed"); + iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value, cb_info->buc_user_priv); From 1ae3f29dcbebe9a4ae123302cc11b9dfac870c8f Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Wed, 23 Oct 2024 00:57:32 +0800 Subject: [PATCH 14/26] DAOS-16574 vos: shrink DTX table blob size - b26 (#15220) (#15221) Use 4KB blob for committed DTX table and 16KB for active DTX table. It is more efficient for lower allocator and reduce the possibility of space allocation failure when space pressure. Simplify vos_dtx_commit logic and code cleanup. Signed-off-by: Fan Yong --- src/vos/vos_dtx.c | 101 +++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 1c60f7815070..f1d69b54f4e4 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -15,11 +15,10 @@ #include "vos_layout.h" #include "vos_internal.h" -/* 128 KB per SCM blob */ -#define DTX_BLOB_SIZE (1 << 17) -/** Ensure 16-bit signed int is sufficient to store record index */ -D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) < (1 << 15)); -D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) < (1 << 15)); +/* 16 KB blob for each active DTX blob */ +#define DTX_ACT_BLOB_SIZE (1 << 14) +/* 4 KB for committed DTX blob */ +#define DTX_CMT_BLOB_SIZE (1 << 12) #define DTX_ACT_BLOB_MAGIC 0x14130a2b #define DTX_CMT_BLOB_MAGIC 0x2502191c @@ -767,7 +766,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, static int vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch, daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p, - struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal) + struct vos_dtx_act_ent **dae_p, bool *rm_cos) { struct vos_dtx_act_ent *dae = NULL; struct vos_dtx_cmt_ent *dce = NULL; @@ -866,10 +865,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t goto out; rc = dtx_rec_release(cont, dae, false); - if (rc != 0) { - *fatal = true; + if (rc != 0) goto out; - } D_ASSERT(dae_p != NULL); *dae_p = dae; @@ -915,7 +912,7 @@ vos_dtx_extend_act_table(struct vos_container *cont) umem_off_t dbd_off; int rc; - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space when create active DTX table.\n"); return -DER_NOSPACE; @@ -923,7 +920,7 @@ vos_dtx_extend_act_table(struct vos_container *cont) dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_ACT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_act_ent_df); dbd->dbd_count = 0; dbd->dbd_index = 0; @@ -1994,12 +1991,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], umem_off_t dbd_off; uint64_t cmt_time = daos_gettime_coarse(); int committed = 0; - int cur = 0; int rc = 0; - int rc1 = 0; + int p = 0; int i = 0; int j; - bool fatal = false; + int k; bool allocated = false; dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); @@ -2017,67 +2013,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto new_blob; again: - for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0; - i++, cur++) { - struct vos_dtx_cmt_ent *dce = NULL; - - rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce, - daes != NULL ? &daes[cur] : NULL, - rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal); - if (dces != NULL) - dces[cur] = dce; - - if (fatal) - goto out; - - if (rc == 0 && (daes == NULL || daes[cur] != NULL)) + for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) { + rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i], + daes != NULL ? &daes[i] : NULL, + rm_cos != NULL ? &rm_cos[i] : NULL); + if (rc == 0 && (daes == NULL || daes[i] != NULL)) committed++; if (rc == -DER_ALREADY || rc == -DER_NONEXIST) rc = 0; - if (rc1 == 0) - rc1 = rc; + if (rc != 0) + goto out; + + if (dces[i] != NULL) + j++; + } - if (dce != NULL) { - rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j], - sizeof(struct vos_dtx_cmt_ent_df), - UMEM_XADD_NO_SNAPSHOT); + if (j > dbd->dbd_count) { + if (!allocated) { + rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count], + sizeof(struct vos_dtx_cmt_ent_df) * + (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; - memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base, + /* Only need to add range for the first partial blob. */ + rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); + if (rc != 0) + goto out; + } + + for (k = dbd->dbd_count; k < j; k++, p++) { + while (dces[p] == NULL) + p++; + + memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base, sizeof(struct vos_dtx_cmt_ent_df)); } - } - if (!allocated) { - /* Only need to add range for the first partial blob. */ - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, - sizeof(dbd->dbd_count)); - if (rc != 0) - D_GOTO(out, fatal = true); + dbd->dbd_count = j; } - dbd->dbd_count = j; - - if (i == count || rc1 != 0) + if (i == count) goto out; new_blob: dbd_prev = dbd; /* Need new @dbd */ - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space to store committed DTX %d "DF_DTI"\n", - count, DP_DTI(&dtis[cur])); - fatal = true; + count, DP_DTI(&dtis[i])); D_GOTO(out, rc = -DER_NOSPACE); } dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_cmt_ent_df); dbd->dbd_prev = umem_ptr2off(umm, dbd_prev); @@ -2090,21 +2083,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], sizeof(cont_df->cd_dtx_committed_head) + sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; cont_df->cd_dtx_committed_head = dbd_off; } else { rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next, sizeof(dbd_prev->dbd_next)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; dbd_prev->dbd_next = dbd_off; rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; } D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n", @@ -2115,7 +2108,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto again; out: - return fatal ? rc : (committed > 0 ? committed : rc1); + return rc < 0 ? rc : committed; } void From dcf84195f80f2d0367fd03264b76d7a9674fdd19 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Wed, 23 Oct 2024 23:20:15 +0900 Subject: [PATCH 15/26] DAOS-16653 doc: Fix CRT_EVENT_DELAY description (#15351) (#15371) Fix the description of the CRT_EVENT_DELAY environment variable in docs/admin/env_variables.md. Signed-off-by: Li Wei --- docs/admin/env_variables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 31e0f3e87672..3de0a0792030 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| -|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| From 4c49f3626726e561dfab41c21a9345573828abc3 Mon Sep 17 00:00:00 2001 From: Ken Cain Date: Wed, 23 Oct 2024 16:54:02 -0400 Subject: [PATCH 16/26] DAOS-16650 control: dmg system exclude, update group version (#15288) (#15349) With this change, when a daos administrator runs dmg system exclude for a given set of engines, the system map version / cart primary group version will be updated. In turn, daos_engines will more immediately detect the "loss" of the administratively excluded engines, update pool maps and perform rebuild. This change supports a use case of a proactive exclusion of ranks that are expected to be impacted by planned maintenance that would cut off connectivity to certain engines. Signed-off-by: Kenneth Cain --- src/control/server/mgmt_system.go | 2 ++ src/control/server/mgmt_system_test.go | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index e85091b5529f..a35753d95ba1 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude }) } + svc.reqGroupUpdate(ctx, false) + return resp, nil } diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index f482b1943a19..0cffe7f63d86 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { mockMember(t, 3, 2, "joined"), }, }, + "unexclude hosts": { req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true}, members: system.Members{ @@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { if tc.req != nil && tc.req.Sys == "" { tc.req.Sys = build.DefaultSystemName } + + startMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("startMapVer CurMapVersion() failed\n") + return + } gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req) test.CmpErr(t, tc.expAPIErr, gotAPIErr) if tc.expAPIErr != nil { return } + // Check for any system map version increase by the (asynchronous) update. + // Test will time out if it never happens, thus choice of an infinite loop here. + for { + curMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("CurMapVersion() failed\n") + return + } + + if curMapVer > startMapVer { + break + } + } + checkRankResults(t, tc.expResults, gotResp.Results) checkMembers(t, tc.expMembers, svc.membership) }) From 2819d4532e00b2395396ba3294e0bdc2c6a0449c Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Thu, 24 Oct 2024 15:04:10 +0800 Subject: [PATCH 17/26] DAOS-16488 chk: take sd_lock before accessing VOS sys_db - b26 (#15269) The VOS sys_db may have multuiple users, such as SMD and CHK. It is caller's duty to take lock against the VOS sys_db before accessing it to handle concurrent operations from multiple XS. Signed-off-by: Fan Yong --- src/chk/chk_vos.c | 67 ++++++++++++++++++++++++++--------------------- src/common/lru.c | 4 ++- src/vos/sys_db.c | 27 +++++++++++++++---- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c index fdefc2995f71..af2904affd13 100644 --- a/src/chk/chk_vos.c +++ b/src/chk/chk_vos.c @@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size) { d_iov_t key_iov; d_iov_t val_iov; + int rc; d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); - return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } static int @@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } @@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size) d_iov_t key_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } static int chk_db_traverse(sys_db_trav_cb_t cb, void *args) { - return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + int rc; + + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } int @@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); if (cpp->cp_rank_nr != 0 && rank_list != NULL) { D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n", @@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); if (rc != 0) - goto end; + goto out; } d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY)); @@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); -end: - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); - out: + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + if (rc != 0) D_ERROR("Failed to update check property on rank %u: "DF_RC"\n", dss_self_rank(), DP_RC(rc)); diff --git a/src/common/lru.c b/src/common/lru.c index de86d367e0ea..87dbdaddaa9a 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -255,7 +255,9 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); - D_ASSERT(d_list_empty(&llink->ll_qlink)); + D_ASSERTF(d_list_empty(&llink->ll_qlink), + "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n", + lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c index e7b4a2baa208..d1f6d4bce98a 100644 --- a/src/vos/sys_db.c +++ b/src/vos/sys_db.c @@ -349,8 +349,9 @@ vos_db_init(const char *db_path) int vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini) { - int create; - int rc; + ABT_mutex_attr attr = ABT_MUTEX_ATTR_NULL; + int create; + int rc; D_ASSERT(db_path != NULL); @@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool goto failed; } - rc = ABT_mutex_create(&vos_db.db_lock); + rc = ABT_mutex_attr_create(&attr); if (rc != ABT_SUCCESS) { - rc = -DER_NOMEM; - goto failed; + D_ERROR("Failed to create mutex attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); + } + + rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE); + if (rc != ABT_SUCCESS) { + D_ERROR("Failed to set mutex attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); } + rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock); + if (rc != ABT_SUCCESS) { + D_ERROR("Failed to create mutex with attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); + } + + ABT_mutex_attr_free(&attr); + vos_db.db_poh = DAOS_HDL_INVAL; vos_db.db_coh = DAOS_HDL_INVAL; @@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool } return 0; failed: + if (attr != ABT_MUTEX_ATTR_NULL) + ABT_mutex_attr_free(&attr); vos_db_fini(); return rc; } From ec3aa1c64d02d959289ec9417df96a8e945b5a0b Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Thu, 24 Oct 2024 20:34:28 +0800 Subject: [PATCH 18/26] DAOS-16469 dtx: optimize DTX CoS cache - b26 (#15085) If there are a lot of committable DTX entries in DTX CoS cache, then it may be inefficient to locate the DTX entry in CoS cache with given oid + dkey_hash, that may happen under the case of that DTX batched commit is blocked (such as because of network trouble) as to trigger DTX refresh (for DTX cleanup) on other related engines. If that happened, it will increase the system load on such engine and slow down DTX commit further more. The patch reduces unnecessary search operation inside CoS cache. Other changes: 1. Metrics (io/dtx/async_cmt_lat/tgt_id) for DTX asynchronously commit latency (with unit ms). 2. Fix a bug in sched_ult2xs() with multiple numa sockets for DSS_XS_OFFLOAD case. 3. Delay commit (or abort) collective DTX on the leader target to handle resent race. 4. Avoid blocking dtx_req_wait() if chore failed to send out some DTX RPC. 5. Some cleanup for error handling. Signed-off-by: Fan Yong --- src/container/srv_target.c | 1 + src/dtx/dtx_common.c | 43 ++-- src/dtx/dtx_cos.c | 308 ++++++++++++++++-------- src/dtx/dtx_internal.h | 4 + src/dtx/dtx_resync.c | 4 +- src/dtx/dtx_rpc.c | 104 +++++--- src/dtx/dtx_srv.c | 13 +- src/engine/ult.c | 2 +- src/include/daos_srv/container.h | 2 + src/include/daos_srv/dtx_srv.h | 9 +- src/tests/ftest/util/telemetry_utils.py | 6 +- 11 files changed, 322 insertions(+), 174 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index a6d3d3c5286a..35ac3d4285c5 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list); *link = &cont->sc_list; return 0; diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ff4f2dfe4ef0..ecb156729ed6 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -392,11 +392,11 @@ dtx_cleanup(void *arg) if (rc == 0) { D_ASSERT(dce != NULL); - rc = dtx_coll_commit(cont, dce, NULL); + rc = dtx_coll_commit(cont, dce, NULL, false); dtx_coll_entry_put(dce); } } else { - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } } @@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg) tls->dt_batched_ult_cnt++; /* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */ - while (!dss_ult_exiting(dbca->dbca_commit_req) && + while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) && dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; - struct dtx_cos_key *dcks = NULL; struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce); + DAOS_EPOCH_MAX, false, &dtes, NULL, &dce); if (cnt == 0) break; @@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg) /* Currently, commit collective DTX one by one. */ D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, NULL, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, NULL, cnt, true); } - dtx_free_committable(dtes, dcks, dce, cnt); + dtx_free_committable(dtes, NULL, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -1271,7 +1270,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul uint32_t flags; int status = -1; int rc = 0; - int i; bool aborted = false; bool unpin = false; @@ -1424,10 +1422,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul vos_dtx_mark_committable(dth); if (dlh->dlh_coll) { - rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false); } else { dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } if (rc != 0) @@ -1487,15 +1485,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul /* If piggyback DTX has been done everywhere, then need to handle CoS cache. * It is harmless to keep some partially committed DTX entries in CoS cache. */ - if (result == 0 && dth->dth_cos_done) { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_del(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } else { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } + dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos, + dth->dth_dti_cos_count, + (result == 0 && dth->dth_cos_done) ? true : false); D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n", DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash, @@ -1654,7 +1646,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce); + NULL, DAOS_EPOCH_MAX, true, &dtes, + dbca->dbca_commit_req != NULL ? &dcks : NULL, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1675,9 +1668,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); } @@ -2365,9 +2358,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 6e1d042b82b3..4c165f94d0c7 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -54,6 +54,8 @@ struct dtx_cos_rec_child { d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ d_list_t dcrc_lo_link; + /* Link into container::sc_dtx_batched_list. */ + d_list_t dcrc_batched_link; union { struct dtx_entry *dcrc_dte; struct dtx_coll_entry *dcrc_dce; @@ -61,8 +63,12 @@ struct dtx_cos_rec_child { /* The DTX epoch. */ daos_epoch_t dcrc_epoch; struct dtx_cos_rec *dcrc_ptr; + uint64_t dcrc_ready_time; uint32_t dcrc_piggyback_refs; - uint32_t dcrc_coll:1; /* For collective DTX. */ + uint32_t dcrc_expcmt:1, + dcrc_prio:1, + dcrc_reg:1, + dcrc_coll:1; /* For collective DTX. */ }; struct dtx_cos_rec_bundle { @@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count = 1; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count = 1; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count = 1; } @@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count++; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count++; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count++; } @@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = { .to_rec_update = dtx_cos_rec_update, }; +static int +dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc) +{ + struct dtx_cos_key key; + d_iov_t kiov; + struct dtx_cos_rec *dcr = dcrc->dcrc_ptr; + uint64_t time = daos_getmtime_coarse() - dcrc->dcrc_ready_time; + int rc = 0; + + d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_lo_link); + if (!d_list_empty(&dcrc->dcrc_batched_link)) + d_list_del_init(&dcrc->dcrc_batched_link); + + if (dcrc->dcrc_expcmt) + dcr->dcr_expcmt_count--; + else if (dcrc->dcrc_prio) + dcr->dcr_prio_count--; + else + dcr->dcr_reg_count--; + + if (dcrc->dcrc_coll) + cont->sc_dtx_committable_coll_count--; + cont->sc_dtx_committable_count--; + + d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) { + key.oid = dcr->dcr_oid; + key.dkey_hash = dcr->dcr_dkey_hash; + d_iov_set(&kiov, &key, sizeof(key)); + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid)); + + if (dcrc->dcrc_coll) + dtx_coll_entry_put(dcrc->dcrc_dce); + else + dtx_entry_put(dcrc->dcrc_dte); + + D_FREE(dcrc); + + return rc; +} + int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, bool force, @@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + /* Last batched commit failed, let's re-commit them. */ + if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) { + dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child, + dcrc_batched_link); + if (unlikely(dcrc->dcrc_coll)) { + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + return 1; + } + + D_ALLOC_ARRAY(dte_buf, max_cnt); + if (dte_buf == NULL) + return -DER_NOMEM; + + d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) { + D_ASSERT(i < max_cnt); + dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte); + } + + *dtes = dte_buf; + return i; + } + /* Process collective DXT with higher priority. */ if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { if (epoch >= dcrc->dcrc_epoch && (dcrc->dcrc_piggyback_refs == 0 || force)) { - D_ALLOC_PTR(dck_buf); - if (dck_buf == NULL) - return -DER_NOMEM; - - dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; @@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, } count = min(cont->sc_dtx_committable_count, max_cnt); - if (count == 0) { - *dtes = NULL; + if (count == 0) return 0; - } D_ALLOC_ARRAY(dte_buf, count); if (dte_buf == NULL) return -DER_NOMEM; - D_ALLOC_ARRAY(dck_buf, count); - if (dck_buf == NULL) { - D_FREE(dte_buf); - return -DER_NOMEM; + if (dcks != NULL) { + D_ALLOC_ARRAY(dck_buf, count); + if (dck_buf == NULL) { + D_FREE(dte_buf); + return -DER_NOMEM; + } } d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) { @@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, continue; D_FREE(dte_buf); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; } dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list); + } if (++i >= count) break; @@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (i == 0) { D_FREE(dte_buf); D_FREE(dck_buf); - *dtes = NULL; } else { *dtes = dte_buf; - *dcks = dck_buf; + if (dcks != NULL) + *dcks = dck_buf; } return i; @@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, } void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash) +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm) { struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; struct dtx_cos_rec *dcr; struct dtx_cos_rec_child *dcrc; + int del = 0; int rc; + int i; key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); d_iov_set(&riov, NULL, 0); - /* It is normal that the DTX entry (to be put) in CoS has already been removed by race. */ - rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov); if (rc == 0) { dcr = (struct dtx_cos_rec *)riov.iov_buf; - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - dcrc->dcrc_piggyback_refs--; - return; + for (i = 0; i < count; i++) { + d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], + sizeof(struct dtx_id)) == 0) { + if (rm) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } else { + dcrc->dcrc_piggyback_refs--; + } + break; + } } } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); } } @@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); if (flags & DCF_COLL) - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); else - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); @@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 1); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_prio_count--; - - D_GOTO(out, found = 1); } d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 2); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_reg_count--; - - D_GOTO(out, found = 2); } d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 3); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_expcmt_count--; - - D_GOTO(out, found = 3); } out: - if (found > 0) { + if (found > 0) d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); - if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); - } - if (rc == 0 && found == 0) rc = -DER_NONEXIST; - D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO, - "Remove DTX "DF_DTI" from CoS " - "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n", - DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash, - found == 1 ? "has" : "has not", DP_RC(rc)); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "Remove DTX from CoS cache "DF_UOID", key %lu", + DP_UOID(*oid), (unsigned long)dkey_hash); return rc == -DER_NONEXIST ? 0 : rc; } @@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont) return dcrc->dcrc_epoch; } +/* + * It is inefficient to search some item on a very long list. So let's skip + * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal. + */ +#define DTX_COS_SEARCH_MAX 32 + void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) @@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; + if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX) + goto expcmt; + d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + dcrc->dcrc_reg = 0; + dcrc->dcrc_prio = 1; d_list_del(&dcrc->dcrc_lo_link); d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_reg_count--; @@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, } } - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - d_list_del(&dcrc->dcrc_lo_link); - d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); - - D_GOTO(out, found = true); - } - } +expcmt: + if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX) + goto out; d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) @@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, /* It is normal that the DTX entry (for priority) in CoS has been committed by race. */ } + +void +dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count) +{ + struct dtx_cos_rec_child *dcrc; + int del = 0; + int rc; + int i = 0; + bool found; + + while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child, + dcrc_batched_link)) != NULL) { + for (found = false; i < count && !found; i++) { + /* + * Some entries in the sc_dtx_batched_list may have been committed by + * others by race. Since the entries order in the sc_dtx_batched_list + * will not be changed, let's compare with xid[i] via one cycle scan. + */ + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) { + found = true; + + if (rm[i]) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } + } + } + + /* There must be one in xid array that matches current dcrc. */ + D_ASSERT(found); + } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); +} diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a42bcc1d7d68..06d0333dd77e 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -213,6 +213,7 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; + struct d_tm_node_t *dt_async_cmt_lat; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont); void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); +void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], + uint32_t count); + /* dtx_rpc.c */ int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index b98a6469954c..756dfe0ca440 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont, } if (j > 0) { - rc = dtx_commit(cont, dtes, dcks, j); + rc = dtx_commit(cont, dtes, dcks, j, true); if (rc < 0) D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n", DP_RC(rc)); @@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un dck.oid = oid; dck.dkey_hash = dkey_hash; - rc = dtx_coll_commit(cont, dce, &dck); + rc = dtx_coll_commit(cont, dce, &dck, true); } dtx_coll_entry_put(dce); diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index e654047a6218..5423f6c0cc88 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: - D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, - drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc); + DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0); drr->drr_comp = 1; drr->drr_result = rc; @@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance) if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) - rc = dtx_req_send(dca->dca_drr, 1); + dtx_req_send(dca->dca_drr, 1); else - rc = dtx_req_send(dca->dca_drr, dca->dca_epoch); - if (rc != 0) { - /* If the first sub-RPC failed, then break, otherwise - * other remote replicas may have already received the - * RPC and executed it, so have to go ahead. - */ - if (dca->dca_i == 0) { - ABT_future_free(&dra->dra_future); - return rc; - } - } + dtx_req_send(dca->dca_drr, dca->dca_epoch); + /* + * Do not care dtx_req_send result, itself or its cb func will set dra->dra_future. + * Each RPC is independent from the others, let's go head to handle the other RPCs + * and set dra->dra_future that will avoid blocking the RPC sponsor - dtx_req_wait. + */ /* dca->dca_drr maybe not points to a real entry if all RPCs have been sent. */ dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next, @@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance) rc = dtx_req_list_send(dca, is_reentrance); if (rc == DSS_CHORE_YIELD) return DSS_CHORE_YIELD; - if (rc == DSS_CHORE_DONE) - rc = 0; - if (rc != 0) - dca->dca_dra.dra_result = rc; - D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore, - dca->dca_dra.dra_opc, rc); + D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n", + dca->dca_dra.dra_opc, rc); if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc); @@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, } rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper); - if (rc != 0) + if (rc != 0) { + ABT_eventual_free(&dca->dca_chore_eventual); goto out; + } rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc); @@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, */ int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count) + struct dtx_cos_key *dcks, int count, bool has_cos) { struct dtx_common_args dca; struct dtx_req_args *dra = &dca.dca_dra; @@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count, DTE_PARTIAL_COMMITTED); } else { - if (dcks != NULL) { + if (has_cos) { if (count > 1) { D_ALLOC_ARRAY(rm_cos, count); if (rm_cos == NULL) @@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, } if (rc1 == 0 && rm_cos != NULL) { - for (i = 0; i < count; i++) { - if (rm_cos[i]) { - D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); - dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, - dcks[i].dkey_hash); + if (dcks != NULL) { + for (i = 0; i < count; i++) { + if (rm_cos[i]) { + D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); + dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, + dcks[i].dkey_hash); + } } + } else { + dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count); } } @@ -1237,7 +1235,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DTX_ST_COMMITTABLE: dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1258,7 +1256,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DSHR_NEED_COMMIT: { dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1571,17 +1569,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) { int rc; - rc = ABT_future_wait(dcra->dcra_future); - D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, - "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", - dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); - ABT_future_free(&dcra->dcra_future); + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } return ret != 0 ? ret : dcra->dcra_result; } int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos) { struct dtx_coll_rpc_args dcra = { 0 }; int *results = NULL; @@ -1591,11 +1592,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d int rc1 = 0; int rc2 = 0; int i; + bool cos = true; if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + /* + * NOTE: Before committing the DTX on remote participants, we cannot remove the active + * DTX locally; otherwise, the local committed DTX entry may be removed via DTX + * aggregation before remote participants commit done. Under such case, if some + * remote DTX participant triggere DTX_REFRESH for such DTX during the interval, + * then it will get -DER_TX_UNCERTAIN, that may cause related application to be + * failed. So here, we let remote participants to commit firstly, if failed, we + * will ask the leader to retry the commit until all participants got committed. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1634,8 +1646,12 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d * to remove the collective DTX entry from the CoS even if the commit failed remotely. * Otherwise, the batched commit ULT may be blocked by such "bad" entry. */ - if (rc2 == 0 && dck != NULL) - dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + if (rc2 == 0 && has_cos) { + if (dck != NULL) + dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + else + dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1); + } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, "Collectively commit DTX "DF_DTI": %d/%d/%d\n", @@ -1658,7 +1674,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + /* + * NOTE: The DTX abort maybe triggered by dtx_leader_end() for timeout on some DTX + * participant(s). Under such case, the client side RPC sponsor may also hit + * the RPC timeout and resends related RPC to the leader. Here, to avoid DTX + * abort and resend RPC forwarding being executed in parallel, we will abort + * local DTX after remote done, before that the logic of handling resent RPC + * on server will find the local pinned DTX entry then notify related client + * to resend sometime later. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 93cc6744a2df..9e78100c451b 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) tls->dt_agg_gen = 1; rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE, - "total number of committable DTX entries", - "entries", "io/dtx/committable/tgt_%u", tgt_id); + "total number of committable DTX entries", "entry", + "io/dtx/committable/tgt_%u", tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create DTX committable metric: " DF_RC"\n", DP_RC(rc)); @@ -48,6 +48,13 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create DTX leader metric: " DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE, + "DTX async commit latency", "ms", + "io/dtx/async_cmt_lat/tgt_%u", tgt_id); + if (rc != DER_SUCCESS) + D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n", + DP_RC(rc)); + return tls; } @@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id) rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER, "total number of processed sync DTX_COMMIT", "ops", - "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id); + "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc)); diff --git a/src/engine/ult.c b/src/engine/ult.c index 94a2a0f93908..fbeb3f538fa8 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id) break; case DSS_XS_OFFLOAD: if (dss_numa_nr > 1) - xs_id = sched_ult2xs_multisocket(xs_type, tgt_id); + return sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr; diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 793d585c07a4..f99f4df14e39 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -134,6 +134,8 @@ struct ds_cont_child { d_list_t sc_dtx_cos_list; /* The global list for committable collective DTXs. */ d_list_t sc_dtx_coll_list; + /* The list for current DTX batched commit. */ + d_list_t sc_dtx_batched_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; }; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7b3f7e9ee57c..7c60d2deaa03 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -318,8 +318,8 @@ int dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, int max, struct dtx_id **dtis); void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash); +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm); int dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg); @@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); + struct dtx_cos_key *dcks, int count, bool has_cos); int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos); int dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index db424b6de685..17f1753b1003 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -90,6 +90,7 @@ class TelemetryUtils(): "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", + "engine_pool_ops_dtx_sync_commit", "engine_pool_ops_ec_agg", "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", @@ -200,6 +201,8 @@ class TelemetryUtils(): "engine_dmabuff_queued_reqs", "engine_dmabuff_grab_errs", *_gen_stats_metrics("engine_dmabuff_grab_retries")] + ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \ + _gen_stats_metrics("engine_io_dtx_async_cmt_lat") ENGINE_IO_DTX_COMMITTABLE_METRICS = \ _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ @@ -304,7 +307,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_ops_tgt_update_active") ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \ _gen_stats_metrics("engine_io_ops_update_active") - ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\ + ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\ + ENGINE_IO_DTX_COMMITTABLE_METRICS +\ ENGINE_IO_DTX_COMMITTED_METRICS +\ ENGINE_IO_LATENCY_FETCH_METRICS +\ ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\ From 2b5620b230df6a7c3700f10128998cfb5360b2fc Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Thu, 24 Oct 2024 10:47:16 -0500 Subject: [PATCH 19/26] DAOS-14262 cart: add ability to select traffic class for SWIM context (#14893) (#14917) Add SWIM_TRAFFIC_CLASS env var (default is unspec) Signed-off-by: Jerome Soumagne --- src/cart/README.env | 12 +++++++----- src/cart/crt_hg.c | 3 +++ src/cart/crt_init.c | 36 ++++++++++++++++++++++++++--------- src/cart/crt_internal_types.h | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/cart/README.env b/src/cart/README.env index 00f270d7a413..b90939c8c720 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -1,13 +1,10 @@ This file lists the environment variables used in CaRT. . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR) - It determines which mercury NA plugin to be used: + It determines which mercury NA plugin and transport to be used: - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider - - set it as "ofi+gni" to use OFI gni provider - set it as "sm" to use SM plugin which only works within single node - - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider. - - set it as "ofi+sockets" to use OFI sockets provider - NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm" + - set it as "ofi+tcp" to use OFI tcp provider. - by default (not set or set as any other value) it will use ofi tcp provider. @@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT. start copying data in an effort to release multi-recv buffers. Copy will occur when at most D_MRECV_BUF_COPY buffers remain. + SWIM_TRAFFIC_CLASS + (server only) Select a traffic class for the SWIM protocol to use and prevent potential + traffic congestion. Available options are: "unspec" (default), "best_effort", + "low_latency", "bulk_data". + diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 789d75ceb31b..244373c9228e 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.request_post_incr = crt_gdata.cg_post_incr; init_info.multi_recv_op_max = crt_gdata.cg_mrecv_buf; init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy; + /* Separate SWIM traffic in an effort to prevent potential congestion. */ + if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx) + init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 21fc184d4469..48d2090a5b36 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,10 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +#define X(a, b) b, +static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; +#undef X + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -237,18 +241,30 @@ crt_gdata_dump(void) DUMP_GDATA_FIELD("%d", cg_rpc_quota); } +static enum crt_traffic_class +crt_str_to_tc(const char *str) +{ + enum crt_traffic_class i = 0; + + while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN) + i++; + + return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i; +} + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) { - uint32_t timeout = 0; - uint32_t credits; - uint32_t fi_univ_size = 0; - uint32_t mem_pin_enable = 0; - uint32_t is_secondary; - uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; - unsigned int mrecv_buf = CRT_HG_MRECV_BUF; - unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ - int rc = 0; + uint32_t timeout = 0; + uint32_t credits; + uint32_t fi_univ_size = 0; + uint32_t mem_pin_enable = 0; + uint32_t is_secondary; + uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; + unsigned int mrecv_buf = CRT_HG_MRECV_BUF; + unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ + char *swim_traffic_class = NULL; + int rc = 0; crt_env_dump(); @@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_mrecv_buf = mrecv_buf; crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy); crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy; + crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class); + crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class); is_secondary = 0; /* Apply CART-890 workaround for server side only */ diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 38133e0cf170..c3eb9cae12cb 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -42,6 +42,17 @@ struct crt_na_config { char **noc_domain_str; /* Array of domains */ }; +#define CRT_TRAFFIC_CLASSES \ + X(CRT_TC_UNSPEC, "unspec") /* Leave it upon plugin to choose */ \ + X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */ \ + X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */ \ + X(CRT_TC_BULK_DATA, "bulk_data") /* Bulk data */ \ + X(CRT_TC_UNKNOWN, "unknown") /* Unknown */ + +#define X(a, b) a, +enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -105,6 +116,9 @@ struct crt_gdata { /** global swim index for all servers */ int32_t cg_swim_crt_idx; + /** traffic class used by SWIM */ + enum crt_traffic_class cg_swim_tc; + /** credits limitation for #in-flight RPCs per target EP CTX */ uint32_t cg_credit_ep_ctx; @@ -220,6 +234,7 @@ struct crt_event_cb_priv { ENV(SWIM_PING_TIMEOUT) \ ENV(SWIM_PROTOCOL_PERIOD_LEN) \ ENV(SWIM_SUSPECT_TIMEOUT) \ + ENV_STR(SWIM_TRAFFIC_CLASS) \ ENV_STR(UCX_IB_FORK_INIT) /* uint env */ From 70b12e3331fea2c44beeba6792b316f1157eec51 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 25 Oct 2024 00:35:58 +0800 Subject: [PATCH 20/26] DAOS-16469 container: Lower log level for cont_aggregate_interval (#15283) To reduce the side-effect caused by frequent log with -DER_INPROGRESS. Signed-off-by: Fan Yong --- src/container/srv_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 35ac3d4285c5..caf2a4b2ee74 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, if (rc == -DER_SHUTDOWN) { break; /* pool destroyed */ } else if (rc < 0) { - DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc, + DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc, DF_CONT ": %s aggregate failed", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), param->ap_vos_agg ? "VOS" : "EC"); From 2a1892f02c08135e9d7d90d23c589df30068b330 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Thu, 24 Oct 2024 12:33:26 -0600 Subject: [PATCH 21/26] DAOS-16716 ci: Set reference build for PRs (#15379) Release branch PRs should use the release branch build instead of master branch build for Fault Injection reference Signed-off-by: Jeff Olivier --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 13ccc5128950..8b2c027b15f8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1038,7 +1038,7 @@ pipeline { } post { always { - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', + discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE recordIssues enabledForFailure: true, From 23f0787ab188704aeb0c0d000e2e181777831690 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Fri, 25 Oct 2024 08:51:07 -0700 Subject: [PATCH 22/26] DAOS-15914: crt_reply_send_input_free() (#14817) - New crt_reply_send_input_free() API added which releases input buffer right after HG_Respond() instead of waiting until the handle is destroyed. - srv_obj.c calls changed to use new crt_reply_send_input_free() - I/O context takes refcount on RPC - only release input buffer for target update Signed-off-by: Alexander A Oganezov Signed-off-by: Liang Zhen Co-authored-by: Liang Zhen --- src/cart/crt_hg.c | 10 +++++++++ src/cart/crt_rpc.c | 20 ++++++++++++++++++ src/cart/crt_rpc.h | 47 +++++++++++++++++++++--------------------- src/dtx/dtx_srv.c | 2 +- src/include/cart/api.h | 15 ++++++++++++++ src/object/srv_obj.c | 22 ++++++++++++++------ 6 files changed, 86 insertions(+), 30 deletions(-) diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 244373c9228e..26b54b52ec3d 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1482,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) rc = crt_hgret_2_der(hg_ret); } + /* Release input buffer */ + if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) { + hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl); + if (hg_ret != HG_SUCCESS) { + RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n", + DP_HG_RC(hg_ret)); + /* Fall through */ + } + } + return rc; } diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 72d8214aa09c..ead03d1bf290 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) return rc; } +int +crt_reply_send_input_free(crt_rpc_t *req) +{ + struct crt_rpc_priv *rpc_priv = NULL; + int rc = 0; + + if (req == NULL) { + D_ERROR("invalid parameter (NULL req).\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + rpc_priv = container_of(req, struct crt_rpc_priv, crp_pub); + rpc_priv->crp_release_input_early = 1; + + return crt_reply_send(req); + +out: + return rc; +} + int crt_reply_send(crt_rpc_t *req) { diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index a5e6afee7800..3a5909331038 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -166,29 +166,30 @@ struct crt_rpc_priv { * match with crp_req_hdr.cch_flags. */ uint32_t crp_flags; - uint32_t crp_srv:1, /* flag of server received request */ - crp_output_got:1, - crp_input_got:1, - /* flag of collective RPC request */ - crp_coll:1, - /* flag of crp_tgt_uri need to be freed */ - crp_uri_free:1, - /* flag of forwarded rpc for corpc */ - crp_forward:1, - /* flag of in timeout binheap */ - crp_in_binheap:1, - /* set if a call to crt_req_reply pending */ - crp_reply_pending:1, - /* set to 1 if target ep is set */ - crp_have_ep:1, - /* RPC is tracked by the context */ - crp_ctx_tracked:1, - /* 1 if RPC fails HLC epsilon check */ - crp_fail_hlc:1, - /* RPC completed flag */ - crp_completed:1, - /* RPC originated from a primary provider */ - crp_src_is_primary:1; + uint32_t crp_srv : 1, /* flag of server received request */ + crp_output_got : 1, crp_input_got : 1, + /* flag of collective RPC request */ + crp_coll : 1, + /* flag of crp_tgt_uri need to be freed */ + crp_uri_free : 1, + /* flag of forwarded rpc for corpc */ + crp_forward : 1, + /* flag of in timeout binheap */ + crp_in_binheap : 1, + /* set if a call to crt_req_reply pending */ + crp_reply_pending : 1, + /* set to 1 if target ep is set */ + crp_have_ep : 1, + /* RPC is tracked by the context */ + crp_ctx_tracked : 1, + /* 1 if RPC fails HLC epsilon check */ + crp_fail_hlc : 1, + /* RPC completed flag */ + crp_completed : 1, + /* RPC originated from a primary provider */ + crp_src_is_primary : 1, + /* release input buffer early */ + crp_release_input_early : 1; struct crt_opc_info *crp_opc_info; /* corpc info, only valid when (crp_coll == 1) */ diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 9e78100c451b..0fd4e7c513f4 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -337,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc) dout->do_status = rc; /* For DTX_COMMIT, it is the count of real committed DTX entries. */ dout->do_misc = committed; - rc = crt_reply_send(rpc); + rc = crt_reply_send_input_free(rpc); if (rc != 0) D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc, DP_RC(rc)); diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 21941d7cb983..16fc2091d4b2 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg); int crt_reply_send(crt_rpc_t *req); +/** + * Send an RPC reply and free the input buffer immediately. + * Only to be called on the server side. + * + * \param[in] req pointer to RPC request + * + * \return DER_SUCCESS on success, negative value if error + * + * \note the crt_rpc_t is exported to user, caller should fill the + * crt_rpc_t::cr_output before sending the RPC reply. + * See \ref crt_req_create. + */ +int +crt_reply_send_input_free(crt_rpc_t *req); + /** * Return request buffer * diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a51682b47853..f220149a5326 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc, } static void -obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, +obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input, struct obj_io_context *ioc) { struct obj_rw_out *orwo = crt_reply_get(rpc); @@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, ioc->ioc_map_ver, orwo->orw_epoch, status); if (!ioc->ioc_lost_reply) { - rc = crt_reply_send(rpc); + if (release_input) + rc = crt_reply_send_input_free(rpc); + else + rc = crt_reply_send(rpc); + if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } else { @@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc D_ASSERT(ioc != NULL); memset(ioc, 0, sizeof(*ioc)); + + crt_req_addref(rpc); ioc->ioc_rpc = rpc; ioc->ioc_opc = opc_get(rpc->cr_opc); rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh); @@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err) ds_cont_child_put(ioc->ioc_coc); ioc->ioc_coc = NULL; } + if (ioc->ioc_rpc) { + crt_req_decref(ioc->ioc_rpc); + ioc->ioc_rpc = NULL; + } } /* Setup lite IO context, it is only for compound RPC so far: @@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey, &iod->iod_name, &recx); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc1)); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) out: if (dth != NULL) rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, true, &ioc); D_FREE(mbs); obj_ioc_end(&ioc, rc); } @@ -3073,7 +3083,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (ioc.ioc_map_ver < max_ver) ioc.ioc_map_ver = max_ver; - obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); + obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc); D_FREE(mbs); D_FREE(dti_cos); obj_ioc_end(&ioc, rc); From 67da3b9b924c16b7b240fcac94e6cdd3044d6415 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 25 Oct 2024 23:52:33 +0800 Subject: [PATCH 23/26] DAOS-16721 object: fix coll RPC for obj with sparse layout - b26 (#15376) The old implementation did not correctly calculate some collective object RPC size, and may cause trouble when need bulk data transfer for large collective object RPC. It also potentially affects how to dispatch collective RPCs from leader to other engines. The patch also addes more sanity check for coll-punch RPC to detect potential DRAM corruption. Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 6 ++- src/dtx/dtx_rpc.c | 5 +- src/object/cli_coll.c | 111 +++++++++++++++++++++++++------------- src/object/cli_shard.c | 7 +-- src/object/obj_internal.h | 8 ++- src/object/obj_utils.c | 2 +- src/object/srv_coll.c | 12 +++-- src/object/srv_obj.c | 8 +-- src/vos/vos_dtx.c | 3 -- 9 files changed, 107 insertions(+), 55 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 863307e9a7f6..7e7c81991551 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -80,8 +80,12 @@ dtx_coll_prep_ult(void *arg) DP_UUID(cont->sc_uuid), DP_RC(rc)); } - if (dcpa->dcpa_result != 0) + if (dcpa->dcpa_result != 0) { + if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", + DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); goto out; + } dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, dci->dci_version, cont->sc_pool->spc_map_version, diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5423f6c0cc88..2ccbfec2734d 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -974,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) { + D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", + DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; + } drop = true; goto next; diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index e05abadf3cf4..3285bec58b34 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool if (coa->coa_sparse == 0) coa->coa_dct_cap = obj_ranks; } + + /* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). */ + coa->coa_min_rank = obj->cob_min_rank; + D_RWLOCK_UNLOCK(&obj->cob_lock); if (coa->coa_sparse) { @@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool coa->coa_dct_nr = -1; } - coa->coa_max_dct_sz = 0; coa->coa_max_shard_nr = 0; coa->coa_max_bitmap_sz = 0; coa->coa_target_nr = 0; @@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa) coa->coa_dct_nr = 0; } +static void +obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct, + uint32_t *size, bool copy) +{ + struct daos_coll_shard *dcs; + uint32_t dct_size; + int i; + + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (i = 0; i <= dct->dct_max_shard; i++) { + dcs = &dct->dct_shards[i]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (copy) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; +} + +struct obj_coll_tree_args { + struct coll_oper_args *coa; + uint32_t *size; +}; + static int obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) { - struct coll_oper_args *coa = arg; - struct daos_coll_target *dct = val->iov_buf; + struct obj_coll_tree_args *octa = arg; + struct coll_oper_args *coa = octa->coa; + struct daos_coll_target *dct = val->iov_buf; D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap, "Too short pre-allcoated dct_array: %u vs %u\n", coa->coa_dct_nr, coa->coa_dct_cap); + D_ASSERT(dct->dct_bitmap != NULL); - memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct)); + obj_coll_collapse_one(coa, dct, octa->size, true); /* The following members have been migrated into coa->coa_dcts. */ dct->dct_bitmap = NULL; @@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) static int obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) { + struct obj_coll_tree_args octa; struct coll_sparse_targets *tree = coa->coa_tree; int rc = 0; @@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) D_GOTO(out, rc = -DER_NOMEM); coa->coa_sparse = 0; - rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa); + coa->coa_raw_sparse = 1; + coa->coa_dct_nr = 0; + coa->coa_max_dct_sz = 0; + + octa.coa = coa; + octa.size = size; + rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, + obj_coll_tree_cb, &octa); if (rc == 0) D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap, "Something is wrong when prepare coll target array: %u vs %u\n", @@ -287,36 +336,13 @@ static int obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size) { struct daos_coll_target *dct; - struct daos_coll_shard *dcs; - uint32_t dct_size; int i; - int j; - for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0; + i < coa->coa_dct_cap; i++) { dct = &coa->coa_dcts[i]; - if (dct->dct_bitmap != NULL) { - /* The size may be over estimated, no matter. */ - dct_size = sizeof(*dct) + dct->dct_bitmap_sz + - sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); - - for (j = 0; j <= dct->dct_max_shard; j++) { - dcs = &dct->dct_shards[j]; - if (dcs->dcs_nr > 1) - dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; - } - - if (coa->coa_for_modify) - dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; - - if (coa->coa_max_dct_sz < dct_size) - coa->coa_max_dct_sz = dct_size; - - if (coa->coa_dct_nr < i) - memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); - - coa->coa_dct_nr++; - *size += dct_size; - } + if (dct->dct_bitmap != NULL) + obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i); } /* Reset the other dct slots to avoid double free during cleanup. */ @@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, D_RWLOCK_RDLOCK(&obj->cob_lock); - D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank, - "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank); + D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank, + "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n", + DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver); D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank, "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank); @@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo uint32_t tgt_size = 0; uint32_t mbs_max_size; uint32_t inline_size; - uint32_t flags = ORF_LEADER; uint32_t leader = -1; uint32_t len; int rc; @@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo memcpy(dct, &tmp_tgt, sizeof(tmp_tgt)); } + /* 'shard' is on the leader target that is must be the coa->coa_dcts[0]. */ + D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank, + "Object "DF_OID" target array corrupted: rank %u vs %ur, nr %u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, + coa->coa_dcts[0].dct_rank, coa->coa_dct_nr); + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); if (rc < 0) goto out; @@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + auxi->flags = ORF_LEADER; if (auxi->io_retry) { - flags |= ORF_RESEND; + auxi->flags |= ORF_RESEND; /* Reset @enqueue_id if resend to new leader. */ if (spa->pa_auxi.target != shard->do_target_id) spa->pa_auxi.enqueue_id = 0; } else { + auxi->flags &= ~ORF_RESEND; spa->pa_auxi.obj_auxi = auxi; daos_dti_gen(&spa->pa_dti, false); } @@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo spa->pa_auxi.shard = shard->do_shard_idx; if (obj_is_ec(obj)) - flags |= ORF_EC; + auxi->flags |= ORF_EC; mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, - args->flags, flags, map_ver, &auxi->map_ver_reply, task); + args->flags, auxi->flags, map_ver, + &auxi->map_ver_reply, task); out: if (rc > 0) diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 36c5c5f1e0c3..0c9dfc1418e5 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1453,9 +1453,10 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), - DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, - (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, + *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ec6aa3b817e3..cfe3385e959c 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -292,10 +292,16 @@ struct coll_oper_args { struct shard_auxi_args coa_auxi; int coa_dct_nr; uint32_t coa_dct_cap; - uint32_t coa_max_dct_sz; + union { + /* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */ + uint32_t coa_min_rank; + /* Only can be used since obj_coll_oper_args_collapse() after object layout scan. */ + uint32_t coa_max_dct_sz; + }; uint8_t coa_max_shard_nr; uint8_t coa_max_bitmap_sz; uint8_t coa_for_modify:1, + coa_raw_sparse:1, coa_sparse:1; uint8_t coa_target_nr; /* diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 7bf0ef4aaf93..82d91c966acc 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -633,7 +633,7 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t * use the "cur_pos" as the relay engine. */ pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; - if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); memcpy(dct, &tmp, sizeof(tmp)); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2a152b47bd64..5b59f954f865 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct int i; int j; - /* dcts[0] is for current engine. */ - if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) { + /* dcts[0] must be for current engine. */ + if (unlikely(dcts[0].dct_rank != dss_self_rank())) { + D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n", + dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags); + D_GOTO(out, rc = -DER_INVAL); + } + + if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL)) { D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, dcts[0].dct_shards == NULL ? "empty" : "non-empty"); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index f220149a5326..041ea903c4fe 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -5615,12 +5615,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " - DF_DTI", forward width %u, forward depth %u\n", + DF_DTI", forward width %u, forward depth %u, flags %x\n", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); D_ASSERT(dmi->dmi_xs_id != 0); @@ -5757,13 +5757,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u", + "tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); obj_punch_complete(rpc, rc, max_ver); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index f1d69b54f4e4..0f87bfd4d0e3 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1973,9 +1973,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = -DER_INPROGRESS; } - if (rc < 0) - D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); - return rc; } From c4cf4f7b58f9315c2a294c899d807edb6b8917b0 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 28 Oct 2024 13:49:15 +0000 Subject: [PATCH 24/26] DAOS-16687 control: Handle missing PCIe caps in storage query usage (#15296) (#15392) Missing PCIe capabilities when querying a NVMe SSD's configuration space is unusual but should be handled gracefully by the control-plane and shouldn't cause a failure to return usage statistics when calling dmg storage query usage. Update so that pciutils lib is only called when attempting to display health stats via dmg and not when fetching usage info. Improve clarity of workflow to ease maintenance and add test coverage for updates. Enable continued functionality when NVMe device doesn't return any extended capabilities in PCIe configuration space data by adding sentinel error to library for such a case. Signed-off-by: Tom Nabarro --- .../common/proto/ctl/storage_nvme.pb.go | 46 +++-- src/control/lib/control/storage.go | 4 +- src/control/lib/hardware/pciutils/bindings.go | 3 +- src/control/server/ctl_smd_rpc_test.go | 49 +++++ src/control/server/instance_storage_rpc.go | 134 ++++++++++---- .../server/instance_storage_rpc_test.go | 172 ++++++++++++------ src/proto/ctl/storage_nvme.proto | 3 +- 7 files changed, 308 insertions(+), 103 deletions(-) diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index 62fede43ed4c..cb2dc5099d45 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -95,11 +95,12 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetLinkStats() bool { + if x != nil { + return x.LinkStats + } + return false +} + type ScanNvmeResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -285,18 +293,20 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, - 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, - 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, - 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, - 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, - 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, + 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, + 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, + 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, + 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, + 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, + 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, + 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 65942a6c12d2..9d5fe470de6a 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe // Health and meta details required to populate usage statistics. Health: req.NvmeHealth || req.Usage, Meta: req.Usage, + // Only request link stats if health explicitly requested. + LinkStats: req.NvmeHealth, }, }) }) diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go index 86686896c80d..f4d43c50fa31 100644 --- a/src/control/lib/hardware/pciutils/bindings.go +++ b/src/control/lib/hardware/pciutils/bindings.go @@ -40,6 +40,7 @@ var ( ErrMultiDevices = errors.New("want single device config got multiple") ErrCfgNotTerminated = errors.New("device config content not new-line terminated") ErrCfgMissing = errors.New("incomplete device config") + ErrNoPCIeCaps = errors.New("no pci-express capabilities found") ) // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL) if cp == nil { - return errors.New("no pci-express capabilities found") + return ErrNoPCIeCaps } cpAddr := uint32(cp.addr) diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 6378e81d23cd..06f1276fa252 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -17,6 +17,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { drpcResps map[int][]*mockDrpcResponse harnessStopped bool ioStopped bool + pciDevErr error expResp *ctlpb.SmdQueryResp expErr error }{ @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, expErr: daos.FreeMemError, }, + "list-devices; with health; update link stats": { + req: &ctlpb.SmdQueryReq{ + OmitPools: true, + Rank: uint32(ranklist.NilRank), + Uuid: test.MockUUID(1), + IncludeBioHealth: true, + }, + drpcResps: map[int][]*mockDrpcResponse{ + 0: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{pbNormDev(0)}, + }, + }, + }, + 1: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{ + func() *ctlpb.SmdDevice { + sd := pbFaultDev(1) + sd.Ctrlr.PciCfg = "ABCD" + return sd + }(), + }, + }, + }, + { + Message: &ctlpb.BioHealthResp{ + Temperature: 1000000, + TempWarn: true, + }, + }, + }, + }, + // Prove mock link stats provider gets called when IncludeBioHealth + // flag is set and Ctrlr.PciCfg string is not empty. + pciDevErr: errors.New("link stats provider fail"), + expErr: errors.New("link stats provider fail"), + }, "ambiguous UUID": { req: &ctlpb.SmdQueryReq{ Rank: uint32(ranklist.NilRank), @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: tc.pciDevErr, + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + }() + engineCount := len(tc.drpcResps) if engineCount == 0 { engineCount = 1 diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index fbca1e534a08..3097440f500e 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -23,14 +23,19 @@ import ( "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/server/storage" ) var ( - scanSmd = listSmdDevices - getCtrlrHealth = getBioHealth + // Function pointers to enable mocking. + scanSmd = listSmdDevices + scanHealth = getBioHealth + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + + // Sentinel errors to enable comparison. errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance") + errCtrlrHealthSkipped = errors.New("controller health update was skipped") ) // newMntRet creates and populates SCM mount result. @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe } func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error { + if health == nil { + return errors.New("nil BioHealthResp") + } + // Convert byte-string to lspci-format. sb := new(strings.Builder) formatBytestring(pciCfg, sb) pciDev := &hardware.PCIDevice{} if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil { - return err + return errors.Wrap(err, "pciutils lib") } // Copy link details from PCIDevice to health stats. @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth lastMaxWidthStr, lastWidthStr, stats.LinkPortId) } -func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) { - stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)] - if !ctrlr.CanSupplyHealthStats() { - engine.Debugf("skip fetching health stats on device %q in %q state", - ctrlr.PciAddr, stateName) - return false, nil +type ctrlrHealthReq struct { + meta bool + engine Engine + bhReq *ctlpb.BioHealthReq + ctrlr *ctlpb.NvmeController + linkStatsProv hardware.PCIeLinkStatsProvider +} + +// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health +// resp if in a bad state. +func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) { + stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)] + if !req.ctrlr.CanSupplyHealthStats() { + req.engine.Debugf("skip fetching health stats on device %q in %q state", + req.ctrlr.PciAddr, stateName) + return nil, errCtrlrHealthSkipped } - health, err := getCtrlrHealth(ctx, engine, req) + health, err := scanHealth(ctx, req.engine, req.bhReq) if err != nil { - return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, + return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr, stateName) } - if ctrlr.PciCfg != "" { - if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil { - return false, errors.Wrapf(err, "add link stats for %q", ctrlr) + return health, nil +} + +// Add link state and capability information to input health statistics for the given controller +// then if successful publish events based on link statistic changes. Link updated health stats to +// controller. +func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error { + err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health) + if err == nil { + publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health) + } else { + if errors.Cause(err) != pciutils.ErrNoPCIeCaps { + return errors.Wrapf(err, "add link stats for %q", req.ctrlr) + } + req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr) + } + + return nil +} + +// Update controller health statistics and include link info if required and available. +func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) { + health, err := getCtrlrHealth(ctx, req) + if err != nil { + if err == errCtrlrHealthSkipped { + // Nothing to do. + return false, nil } - publishLinkStatEvents(engine, ctrlr.PciAddr, health) + return false, errors.Wrap(err, "get ctrlr health") + } + + if req.linkStatsProv == nil { + req.engine.Debugf("device %q skip adding link stats; nil provider", + req.ctrlr.PciAddr) + } else if req.ctrlr.PciCfg == "" { + req.engine.Debugf("device %q skip adding link stats; empty pci cfg", + req.ctrlr.PciAddr) } else { - engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr) + if err = setCtrlrHealthWithLinkInfo(req, health); err != nil { + return false, errors.Wrap(err, "set ctrlr health") + } } - ctrlr.HealthStats = health - ctrlr.PciCfg = "" + req.ctrlr.HealthStats = health return true, nil } @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc c := seenCtrlrs[addr] - // Only minimal info provided in standard scan to enable result aggregation across - // homogeneous hosts. engineRank, err := engine.GetRank() if err != nil { return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) } + + // Only provide minimal info in standard scan to enable result aggregation across + // homogeneous hosts. nsd := &ctlpb.SmdDevice{ RoleBits: sd.RoleBits, CtrlrNamespaceId: sd.CtrlrNamespaceId, @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc // Populate health if requested. healthUpdated := false if pbReq.Health && c.HealthStats == nil { - bhReq := &ctlpb.BioHealthReq{ - DevUuid: sd.Uuid, - MetaSize: pbReq.MetaSize, - RdbSize: pbReq.RdbSize, + bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid} + if pbReq.Meta { + bhReq.MetaSize = pbReq.MetaSize + bhReq.RdbSize = pbReq.RdbSize } - upd, err := populateCtrlrHealth(ctx, engine, bhReq, c, - hwprov.DefaultPCIeLinkStatsProvider()) + + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: bhReq, + ctrlr: c, + } + if pbReq.LinkStats { + // Add link stats to health if flag set. + chReq.linkStatsProv = linkStatsProv + } + + healthUpdated, err = populateCtrlrHealth(ctx, chReq) if err != nil { return nil, err } - healthUpdated = upd } + // Used to update health with link stats, now redundant. + c.PciCfg = "" // Populate usage data if requested. if pbReq.Meta { @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq continue // Skip health query if UUID doesn't match requested. } if pbReq.IncludeBioHealth { - bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid} - if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr, - hwprov.DefaultPCIeLinkStatsProvider()); err != nil { + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid}, + ctrlr: dev.Ctrlr, + linkStatsProv: linkStatsProv, + } + + if _, err = populateCtrlrHealth(ctx, chReq); err != nil { return nil, err } } + // Used to update health with link stats, now redundant. + dev.Ctrlr.PciCfg = "" + if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid { rResp.Devices = []*ctlpb.SmdDevice{dev} found = true diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index b199adb6b8d3..0f72f739970b 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -21,6 +21,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { } for name, tc := range map[string]struct { - badDevState bool - noPciCfgSpc bool - pciDev *hardware.PCIDevice - pciDevErr error - emptyHealthRes bool - healthErr error - lastStats map[string]*ctlpb.BioHealthResp - expCtrlr *ctlpb.NvmeController - expNotUpdated bool - expErr error - expDispatched []*events.RASEvent - expLastStats map[string]*ctlpb.BioHealthResp + badDevState bool + nilLinkStatsProv bool + noPciCfgSpc bool + pciDev *hardware.PCIDevice + pciDevErr error + healthReq *ctlpb.BioHealthReq + healthRes *ctlpb.BioHealthResp + nilHealthRes bool + healthErr error + lastStats map[string]*ctlpb.BioHealthResp + expCtrlr *ctlpb.NvmeController + expNotUpdated bool + expErr error + expDispatched []*events.RASEvent + expLastStats map[string]*ctlpb.BioHealthResp }{ "bad state; skip health": { badDevState: true, - noPciCfgSpc: true, expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", }, expNotUpdated: true, }, @@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { HealthStats: healthWithLinkStats(0, 0, 0, 0), }, }, + "nil bio health response": { + nilHealthRes: true, + expErr: errors.New("nil BioHealthResp"), + }, "empty bio health response; empty link stats": { - emptyHealthRes: true, - pciDev: new(hardware.PCIDevice), + healthRes: new(ctlpb.BioHealthResp), + pciDev: new(hardware.PCIDevice), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: new(ctlpb.BioHealthResp), }, @@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { pciDevErr: errors.New("fail"), expErr: errors.New("fail"), }, + "update health; add link stats; pciutils lib error; missing pcie caps": { + pciDevErr: pciutils.ErrNoPCIeCaps, + expCtrlr: &ctlpb.NvmeController{ + PciAddr: pciAddr, + PciCfg: "ABCD", + DevState: ctlpb.NvmeDevState_NORMAL, + HealthStats: healthWithLinkStats(0, 0, 0, 0), + }, + }, "update health; add link stats; normal link state; no event published": { expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -167,6 +188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(proto.MockNvmeHealth()), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8), }, @@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4), }, @@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4), }, @@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4), }, @@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - healthRes := healthWithLinkStats(0, 0, 0, 0) - if tc.emptyHealthRes { - healthRes = new(ctlpb.BioHealthResp) - } - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { - return healthRes, tc.healthErr + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth }() var devState ctlpb.NvmeDevState @@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { LinkMaxWidth: 4, } } + if tc.healthRes == nil && !tc.nilHealthRes { + tc.healthRes = healthWithLinkStats(0, 0, 0, 0) + } - mockProv := &mockPCIeLinkStatsProvider{ - pciDev: tc.pciDev, - pciDevErr: tc.pciDevErr, + var mockProv *mockPCIeLinkStatsProvider + if !tc.nilLinkStatsProv { + mockProv = &mockPCIeLinkStatsProvider{ + pciDev: tc.pciDev, + pciDevErr: tc.pciDevErr, + } } ctrlr := &ctlpb.NvmeController{ @@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { subscriber := newMockSubscriber(2) ps.Subscribe(events.RASTypeInfoOnly, subscriber) - upd, err := populateCtrlrHealth(test.Context(t), ei, - &ctlpb.BioHealthReq{}, ctrlr, mockProv) + chReq := ctrlrHealthReq{ + engine: ei, + bhReq: tc.healthReq, + ctrlr: ctrlr, + linkStatsProv: mockProv, + } + + upd, err := populateCtrlrHealth(test.Context(t), chReq) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { } defSmdScanRes := func() *ctlpb.SmdDevResp { sd := proto.MockSmdDevice(c, 2) + sd.Rank = 2 return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}} } healthRespWithUsage := func() *ctlpb.BioHealthResp { @@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { mh.MetaWalSize, mh.RdbWalSize = 4, 5 return mh } + ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 + sd.TotalBytes = c.HealthStats.TotalBytes + sd.AvailBytes = c.HealthStats.AvailBytes + sd.ClusterSize = c.HealthStats.ClusterSize + sd.MetaWalSize = c.HealthStats.MetaWalSize + sd.RdbWalSize = c.HealthStats.RdbWalSize + c.SmdDevices = []*ctlpb.SmdDevice{sd} + return c + } for name, tc := range map[string]struct { req ctlpb.ScanNvmeReq @@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; usage and wal size reported": { + "scan over drpc; with meta and health; usage and wal size reported": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ - func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = healthRespWithUsage() - sd := proto.MockSmdDevice(nil, 2) - sd.Rank = 1 - sd.TotalBytes = c.HealthStats.TotalBytes - sd.AvailBytes = c.HealthStats.AvailBytes - sd.ClusterSize = c.HealthStats.ClusterSize - sd.MetaWalSize = c.HealthStats.MetaWalSize - sd.RdbWalSize = c.HealthStats.RdbWalSize - c.SmdDevices = []*ctlpb.SmdDevice{sd} - return c - }(), - }, - State: new(ctlpb.ResponseState), + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), }, }, "scan over drpc; only ctrlrs with valid states shown": { @@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; missing ctrlr in smd": { + "scan over drpc; with meta and health; missing ctrlr in smd": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: func() *ctlpb.SmdDevResp { ssr := defSmdScanRes() @@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { healthRes: healthRespWithUsage(), expErr: errors.New("no ctrlr ref"), }, - "scan over drpc; with smd and health; health scan fails": { + "scan over drpc; with meta and health; health scan fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: defSmdScanRes(), healthErr: errors.New("health scan failed"), expErr: errors.New("health scan failed"), }, - "scan over drpc; with smd and health; smd list fails": { + "scan over drpc; with meta and health; smd list fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdErr: errors.New("smd scan failed"), healthRes: healthRespWithUsage(), expErr: errors.New("smd scan failed"), }, - "scan over drpc; with smd and health; nil smd list returned": { + "scan over drpc; with meta and health; nil smd list returned": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, healthRes: healthRespWithUsage(), expErr: errors.New("nil smd scan resp"), }, + "scan over drpc; with meta and health; link info update skipped": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + rank: 1, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with health; link info update run but failed": { + req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true}, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + // Prove link stat provider gets called when LinkStats flag set. + expErr: errors.New("link stats provider fail"), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { defer func() { scanSmd = listSmdDevices }() - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth + }() + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: errors.New("link stats provider fail"), + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() }() if tc.provRes == nil { @@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { ei.setSuperblock(nil) } else { ei.setSuperblock(&Superblock{ - Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true, + Rank: ranklist.NewRankPtr(uint32(tc.rank)), + ValidRank: true, }) } diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index 944d8e943bae..be068e5274d0 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,6 +28,7 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob + bool LinkStats = 6; // Populate PCIe link info in health statistics } message ScanNvmeResp { From eb95b5555016ccb40c6915ededd17943f652f1e0 Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Mon, 28 Oct 2024 09:00:18 -0500 Subject: [PATCH 25/26] DAOS-16722 client: to intercept PMPI_Init() in libpil4dfs (#15387) Intercept PMPI_Init() to avoid calling daos_init() if MPI_Init() is intercepted by other library (like darshan and mpip). Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/int_dfs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 3087cb09c75f..b69449f1e3d1 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -470,6 +470,7 @@ static int (*next_tcgetattr)(int fd, void *termios_p); /* end NOT supported by DAOS */ static int (*next_mpi_init)(int *argc, char ***argv); +static int (*next_pmpi_init)(int *argc, char ***argv); /* to do!! */ /** @@ -1041,6 +1042,22 @@ MPI_Init(int *argc, char ***argv) return rc; } +int +PMPI_Init(int *argc, char ***argv) +{ + int rc; + + if (next_pmpi_init == NULL) { + next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init"); + D_ASSERT(next_pmpi_init != NULL); + } + + atomic_fetch_add_relaxed(&mpi_init_count, 1); + rc = next_pmpi_init(argc, argv); + atomic_fetch_add_relaxed(&mpi_init_count, -1); + return rc; +} + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. From bde13c310ef66e302d3bfb55af2b252a67386cb7 Mon Sep 17 00:00:00 2001 From: mjean308 <48688872+mjean308@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:25:47 -0400 Subject: [PATCH 26/26] DAOS-15943 test: Remove server logging from pre-teardown (#15282) (#15386) Signed-off-by: Maureen Jean --- src/tests/ftest/util/soak_test_base.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index f32e068cb167..2d1c11e722e9 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -25,8 +25,8 @@ from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, - create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers, - get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot, + create_racer_cmdline, ddhhmmss_format, get_harassers, + launch_exclude_reintegrate, launch_extend, launch_reboot, launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) @@ -164,17 +164,6 @@ def pre_tear_down(self): # display final metrics run_metrics_check(self, prefix="final") - # Gather server logs - try: - get_daos_server_logs(self) - except SoakTestError as error: - errors.append(f"<>") - # Gather journalctl logs - hosts = list(set(self.hostlist_servers)) - since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time)) - until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time)) - for journalctl_type in ["kernel", "daos_server"]: - get_journalctl(self, hosts, since, until, journalctl_type, logging=True) if self.all_failed_harassers: errors.extend(self.all_failed_harassers)