From 76cfb41b852f94859a32caaa7b3b379f069d0881 Mon Sep 17 00:00:00 2001
From: wiliamhuang <lei.huang@intel.com>
Date: Tue, 15 Oct 2024 10:18:18 -0500
Subject: [PATCH 01/26] DAOS-16556 client: call fstat() before mmap() to update
 file status in kernel (#15304)

Signed-off-by: Lei Huang <lei.huang@intel.com>
---
 src/client/dfuse/pil4dfs/int_dfs.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index fff49d9f0ae7..3087cb09c75f 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -6224,6 +6224,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
 
 	atomic_fetch_add_relaxed(&num_mmap, 1);
 
+	if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) {
+		/* DAOS-14494: Force the kernel to update the size before mapping. */
+		rc = next_fxstat(1, fd, &stat_buf);
+		if (rc == -1)
+			return MAP_FAILED;
+		return next_mmap(addr, length, prot, flags, fd, offset);
+	}
+
 	addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset);
 	if (addr_ret == MAP_FAILED)
 		return MAP_FAILED;
@@ -6842,10 +6850,8 @@ init_myhook(void)
 
 	register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl));
 
-	if (d_compatible_mode == false) {
-		register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap));
-		register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap));
-	}
+	register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap));
+	register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap));
 
 	register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit));
 	register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3));

From b4eb68912b13345905efc98bdf2ed9b1c948113d Mon Sep 17 00:00:00 2001
From: Makito Kano <makito.kano@intel.com>
Date: Wed, 16 Oct 2024 09:40:29 +0900
Subject: [PATCH 02/26] =?UTF-8?q?DAOS-16446=20test:=20HDF5-VOL=20test=20-?=
 =?UTF-8?q?=20Set=20object=20class=20and=20container=20prope=E2=80=A6=20(#?=
 =?UTF-8?q?15004)=20(#15098)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In HDF5, DFS, MPIIO, or POSIX, object class and container properties are defined
during the container create. If it’s DFS, object class is also set to the IOR
parameter. However, in HDF5-VOL, object class and container properties are
defined with the following environment variables of mpirun.

HDF5_DAOS_OBJ_CLASS (Object class)
HDF5_DAOS_FILE_PROP (Container properties)

The infrastructure to set these variables are already there in run_ior_with_pool().
In file_count_test_base.py, pass in the env vars to run_ior_with_pool(env=env) as a
dictionary. Object class is the oclass variable. Container properties can be
obtained from self.container.properties.value.

This fix is discussed in PR #14964.

Signed-off-by: Makito Kano <makito.kano@intel.com>
---
 src/tests/ftest/util/file_count_test_base.py | 31 ++++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py
index 12c66d76b8cc..f95e22bd4bdd 100644
--- a/src/tests/ftest/util/file_count_test_base.py
+++ b/src/tests/ftest/util/file_count_test_base.py
@@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase):
     :avocado: recursive
     """
 
-    def add_containers(self, file_oclass=None, dir_oclass=None):
-        """Create a list of containers that the various jobs use for storage.
+    def get_file_write_container(self, file_oclass=None, dir_oclass=None):
+        """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass.
 
         Args:
-            file_oclass (str, optional): file object class of container.
-                                         Defaults to None.
-            dir_oclass (str, optional): dir object class of container.
-                                        Defaults to None.
+            file_oclass (str, optional): file object class of container. Defaults to None.
+            dir_oclass (str, optional): dir object class of container. Defaults to None.
 
+        Returns:
+            TestContainer: Created container with oclass, dir_oclass, and rd_fac set.
 
         """
         # Create a container and add it to the overall list of containers
@@ -92,7 +92,7 @@ def run_file_count(self):
                     rd_fac = extract_redundancy_factor(oclass)
                     dir_oclass = self.get_diroclass(rd_fac)
                     self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass)
-                    self.container = self.add_containers(oclass, dir_oclass)
+                    self.container = self.get_file_write_container(oclass, dir_oclass)
                     try:
                         self.processes = mdtest_np
                         self.ppn = mdtest_ppn
@@ -111,14 +111,27 @@ def run_file_count(self):
                 # run ior
                 self.log.info("=======>>>Starting IOR with %s and %s", api, oclass)
                 self.ior_cmd.dfs_oclass.update(oclass)
-                self.container = self.add_containers(oclass)
+                self.container = self.get_file_write_container(oclass)
                 self.update_ior_cmd_with_pool(False)
                 try:
                     self.processes = ior_np
                     self.ppn = ior_ppn
                     if api == 'HDF5-VOL':
+                        # Format the container properties so that it works with HDF5-VOL env var.
+                        # Each entry:value pair needs to be separated by a semicolon. Since we're
+                        # using this in the mpirun command, semicolon would indicate the end of the
+                        # command, so quote the whole thing.
+                        cont_props = self.container.properties.value
+                        cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"'
+                        self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol)
+                        env = self.ior_cmd.env.copy()
+                        env.update({
+                            "HDF5_DAOS_OBJ_CLASS": oclass,
+                            "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol
+                        })
                         self.ior_cmd.api.update('HDF5')
-                        self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path)
+                        self.run_ior_with_pool(
+                            create_pool=False, plugin_path=hdf5_plugin_path, env=env)
                     elif self.ior_cmd.api.value == 'POSIX':
                         self.run_ior_with_pool(create_pool=False, intercept=intercept)
                     else:

From 6e16c8e21bcc1cb5b29c62aca45e309201c0d9ca Mon Sep 17 00:00:00 2001
From: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
Date: Wed, 16 Oct 2024 02:43:26 +0200
Subject: [PATCH 03/26] DAOS-16673 common: ignore Hadoop 3.4.0 related CVE
 (#15320)

Hadoope 3.4.0 has resolved a few CVE issues but introduces new

+ enable Trivy scans on release branch
+ enable on demand scan and scan on final PR merge.

Signed-off-by: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
---
 .github/workflows/trivy.yml |  70 ++++++++++
 utils/trivy/.trivyignore    |  27 ++++
 utils/trivy/csv.tpl         |  29 +++++
 utils/trivy/trivy.yaml      | 249 ++++++++++++++++++++++++++++++++++++
 4 files changed, 375 insertions(+)
 create mode 100644 .github/workflows/trivy.yml
 create mode 100644 utils/trivy/.trivyignore
 create mode 100644 utils/trivy/csv.tpl
 create mode 100644 utils/trivy/trivy.yaml

diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
new file mode 100644
index 000000000000..8f5524d4513c
--- /dev/null
+++ b/.github/workflows/trivy.yml
@@ -0,0 +1,70 @@
+name: Trivy scan
+
+on:
+  workflow_dispatch:
+  push:
+    branches: ["master", "release/**"]
+  pull_request:
+    branches: ["master", "release/**"]
+
+# Declare default permissions as nothing.
+permissions: {}
+
+jobs:
+  build:
+    name: Build
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
+
+      - name: Prepare the report to be uploaded to the GitHub artifact store
+        run: |
+          mkdir report
+          cp trivy-report-daos.txt report
+          cp utils/trivy/.trivyignore report/trivyignore.txt
+
+      - name: Upload the report to the GitHub artifact store
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808  # v4.3.3
+        with:
+          path: report/*
+          name: trivy-report-daos
+
+      - name: Adjust config file to use sarif format
+        run: |
+          sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \
+            utils/trivy/trivy.yaml
+          sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
+
+      - name: Upload Trivy scan results to GitHub Security tab
+        uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a
+        # 3.25.15 (v3)
+        with:
+          sarif_file: 'trivy-results.sarif'
+
+      - name: Adjust config file to show and validate scan results
+        run: |
+          sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml
+          sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml
+          sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore
new file mode 100644
index 000000000000..3a3b4cff1cee
--- /dev/null
+++ b/utils/trivy/.trivyignore
@@ -0,0 +1,27 @@
+## Ignored hadoop 3.3.6 related CVE
+## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428
+CVE-2023-52428
+## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410
+CVE-2023-39410
+## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710
+CVE-2024-25710
+## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308
+CVE-2024-26308
+## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131
+CVE-2024-29131
+## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133
+CVE-2024-29133
+## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638
+CVE-2024-25638
+
+## Ignored hadoop 3.4.0 related CVE
+## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561
+CVE-2024-47561
+## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential  blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201
+CVE-2023-33201
+## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857
+CVE-2024-29857
+## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171
+CVE-2024-30171
+## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172
+CVE-2024-30172
diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl
new file mode 100644
index 000000000000..d3bcafcef5e9
--- /dev/null
+++ b/utils/trivy/csv.tpl
@@ -0,0 +1,29 @@
+{{ range . }}
+Trivy Vulnerability Scan Results ({{- .Target -}})
+VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information
+{{ range .Vulnerabilities }}
+    {{- .VulnerabilityID }},
+    {{- .Severity }},
+    {{- range $key, $value := .CVSS }}
+        {{- if (eq $key "nvd") }}
+            {{- .V3Score -}}
+        {{- end }}
+    {{- end }},
+    {{- quote .Title }},
+    {{- quote .PkgName }},
+    {{- quote .InstalledVersion }},
+    {{- quote .FixedVersion }},
+    {{- .PrimaryURL }}
+{{ else -}}
+    No vulnerabilities found at this time.
+{{ end }}
+Trivy Dependency Scan Results ({{ .Target }})
+ID,Name,Version,Notes
+{{ range .Packages -}}
+    {{- quote .ID }},
+    {{- quote .Name }},
+    {{- quote .Version }}
+{{ else -}}
+    No dependencies found at this time.
+{{ end }}
+{{ end }}
\ No newline at end of file
diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml
new file mode 100644
index 000000000000..cfb13b5c40f4
--- /dev/null
+++ b/utils/trivy/trivy.yaml
@@ -0,0 +1,249 @@
+cache:
+  backend: fs
+  dir:
+  redis:
+    ca: ""
+    cert: ""
+    key: ""
+    tls: false
+    ttl: 0s
+config: trivy.yaml
+db:
+  download-java-only: false
+  download-only: false
+  java-repository: ghcr.io/aquasecurity/trivy-java-db
+  java-skip-update: false
+  no-progress: false
+  repository: ghcr.io/aquasecurity/trivy-db
+  skip-update: false
+debug: false
+dependency-tree: true
+exit-code: 0
+generate-default-config: false
+ignore-policy: ""
+ignorefile: ./utils/trivy/.trivyignore
+include-dev-deps: false
+insecure: false
+license:
+  confidencelevel: "0.9"
+  forbidden:
+    - AGPL-1.0
+    - AGPL-3.0
+    - CC-BY-NC-1.0
+    - CC-BY-NC-2.0
+    - CC-BY-NC-2.5
+    - CC-BY-NC-3.0
+    - CC-BY-NC-4.0
+    - CC-BY-NC-ND-1.0
+    - CC-BY-NC-ND-2.0
+    - CC-BY-NC-ND-2.5
+    - CC-BY-NC-ND-3.0
+    - CC-BY-NC-ND-4.0
+    - CC-BY-NC-SA-1.0
+    - CC-BY-NC-SA-2.0
+    - CC-BY-NC-SA-2.5
+    - CC-BY-NC-SA-3.0
+    - CC-BY-NC-SA-4.0
+    - Commons-Clause
+    - Facebook-2-Clause
+    - Facebook-3-Clause
+    - Facebook-Examples
+    - WTFPL
+  full: false
+  ignored: []
+  notice:
+    - AFL-1.1
+    - AFL-1.2
+    - AFL-2.0
+    - AFL-2.1
+    - AFL-3.0
+    - Apache-1.0
+    - Apache-1.1
+    - Apache-2.0
+    - Artistic-1.0-cl8
+    - Artistic-1.0-Perl
+    - Artistic-1.0
+    - Artistic-2.0
+    - BSL-1.0
+    - BSD-2-Clause-FreeBSD
+    - BSD-2-Clause-NetBSD
+    - BSD-2-Clause
+    - BSD-3-Clause-Attribution
+    - BSD-3-Clause-Clear
+    - BSD-3-Clause-LBNL
+    - BSD-3-Clause
+    - BSD-4-Clause
+    - BSD-4-Clause-UC
+    - BSD-Protection
+    - CC-BY-1.0
+    - CC-BY-2.0
+    - CC-BY-2.5
+    - CC-BY-3.0
+    - CC-BY-4.0
+    - FTL
+    - ISC
+    - ImageMagick
+    - Libpng
+    - Lil-1.0
+    - Linux-OpenIB
+    - LPL-1.02
+    - LPL-1.0
+    - MS-PL
+    - MIT
+    - NCSA
+    - OpenSSL
+    - PHP-3.01
+    - PHP-3.0
+    - PIL
+    - Python-2.0
+    - Python-2.0-complete
+    - PostgreSQL
+    - SGI-B-1.0
+    - SGI-B-1.1
+    - SGI-B-2.0
+    - Unicode-DFS-2015
+    - Unicode-DFS-2016
+    - Unicode-TOU
+    - UPL-1.0
+    - W3C-19980720
+    - W3C-20150513
+    - W3C
+    - X11
+    - Xnet
+    - Zend-2.0
+    - zlib-acknowledgement
+    - Zlib
+    - ZPL-1.1
+    - ZPL-2.0
+    - ZPL-2.1
+  permissive: []
+  reciprocal:
+    - APSL-1.0
+    - APSL-1.1
+    - APSL-1.2
+    - APSL-2.0
+    - CDDL-1.0
+    - CDDL-1.1
+    - CPL-1.0
+    - EPL-1.0
+    - EPL-2.0
+    - FreeImage
+    - IPL-1.0
+    - MPL-1.0
+    - MPL-1.1
+    - MPL-2.0
+    - Ruby
+  restricted:
+    - BCL
+    - CC-BY-ND-1.0
+    - CC-BY-ND-2.0
+    - CC-BY-ND-2.5
+    - CC-BY-ND-3.0
+    - CC-BY-ND-4.0
+    - CC-BY-SA-1.0
+    - CC-BY-SA-2.0
+    - CC-BY-SA-2.5
+    - CC-BY-SA-3.0
+    - CC-BY-SA-4.0
+    - GPL-1.0
+    - GPL-2.0
+    - GPL-2.0-with-autoconf-exception
+    - GPL-2.0-with-bison-exception
+    - GPL-2.0-with-classpath-exception
+    - GPL-2.0-with-font-exception
+    - GPL-2.0-with-GCC-exception
+    - GPL-3.0
+    - GPL-3.0-with-autoconf-exception
+    - GPL-3.0-with-GCC-exception
+    - LGPL-2.0
+    - LGPL-2.1
+    - LGPL-3.0
+    - NPL-1.0
+    - NPL-1.1
+    - OSL-1.0
+    - OSL-1.1
+    - OSL-2.0
+    - OSL-2.1
+    - OSL-3.0
+    - QPL-1.0
+    - Sleepycat
+  unencumbered:
+    - CC0-1.0
+    - Unlicense
+    - 0BSD
+list-all-pkgs: false
+misconfiguration:
+  cloudformation:
+    params: []
+  helm:
+    set: []
+    set-file: []
+    set-string: []
+    values: []
+  include-non-failures: false
+  check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0
+  # scanners:
+  #  - azure-arm
+  #  - cloudformation
+  #  - dockerfile
+  #  - helm
+  #  - kubernetes
+  #  - terraform
+  #  - terraformplan
+  terraform:
+    exclude-downloaded-modules: false
+    vars: []
+module:
+  dir:
+  enable-modules: []
+output: "trivy-report-daos.txt"
+format: template
+template: '@./utils/trivy/csv.tpl'
+output-plugin-arg: ""
+quiet: false
+registry:
+  password: []
+  token: ""
+  username: []
+rego:
+  data: []
+  namespaces: []
+  policy: []
+  skip-policy-update: false
+  trace: false
+report: all
+scan:
+  compliance: ""
+  file-patterns: []
+  offline: false
+  parallel: 1
+  rekor-url: https://rekor.sigstore.dev
+  sbom-sources: []
+  scanners:
+    - vuln
+    - secret
+  # ignore all hadoop dependencies
+  skip-dirs:
+    ./src/client/java/hadoop-daos
+  skip-files: []
+  show-suppressed: true
+secret:
+  config: trivy-secret.yaml
+server:
+  addr: ""
+  custom-headers: []
+  token: ""
+  token-header: Trivy-Token
+severity:
+  - UNKNOWN
+  - MEDIUM
+  - HIGH
+  - CRITICAL
+timeout: 5m0s
+version: false
+vulnerability:
+  ignore-status: []
+  ignore-unfixed: false
+  type:
+    - os
+    - library

From d9f16a1d6f7326ace19b1f7d4a6f182b4bde0020 Mon Sep 17 00:00:00 2001
From: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
Date: Wed, 16 Oct 2024 02:48:45 +0200
Subject: [PATCH 04/26] DAOS-14408 common: ensure NDCTL not used for storage
 class `ram` (#15203)

* DAOS-14408 common: enable NDCTL for DCPM

This PR prepares DAOS to be used with NDCTL enabled in PMDK, which means:
- NDCTL must not be used when non-DCPM (simulate PMem) - `storage class: "ram"` is used:
`PMEMOBJ_CONF=sds.at_create=0` env variable disables NDCTL features in the PMDK
This change affects all tests run on simulated PMem (e.g. inside VMs).
Some DOAS utility applications may also require `PMEMOBJ_CONF=sds.at_create=0` to be set.

- The default ULT stack size must be at least 20KiB to avoid stack overuse by PMDK with NDCTL enabled and be aligned with Linux page size.
`ABT_THREAD_STACKSIZE=20480` env variable is used to increase the default ULT stack size.
This env variable is set by control/server module just before engine is started.
Much bigger stack is used for pmempool open/create-related tasks e.g. `tgt_vos_create_one` to avoid stack overusage.

This modification shall not affect md-on-ssd mode as long as `storage class: "ram"` is used for the first tier in the `storage` configuration.
This change does not require any configuration changes to existing systems.

The new PMDK package with NDCTL enabled (https://github.com/daos-stack/pmdk/pull/38) will land as soon as this PR is merged.

Signed-off-by: Jan Michalski <jan.michalski@intel.com>
---
 .github/workflows/landing-builds.yml     |   1 +
 debian/changelog                         |  16 ++
 debian/control                           |   6 +-
 site_scons/components/__init__.py        |   1 -
 src/control/server/engine/config.go      |  86 ++++++++++
 src/control/server/engine/config_test.go | 207 +++++++++++++++++++++++
 src/control/server/server.go             |   6 +
 src/mgmt/srv_target.c                    |   2 +-
 utils/build.config                       |   3 +-
 utils/rpms/daos.rpmlintrc                |  10 +-
 utils/rpms/daos.spec                     |  29 +++-
 utils/run_utest.py                       |   1 +
 utils/scripts/install-el8.sh             |   2 +
 utils/scripts/install-el9.sh             |   2 +
 utils/scripts/install-leap15.sh          |   1 +
 utils/scripts/install-ubuntu.sh          |   2 +
 16 files changed, 361 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml
index 103f150915a4..814c4f0e71f1 100644
--- a/.github/workflows/landing-builds.yml
+++ b/.github/workflows/landing-builds.yml
@@ -16,6 +16,7 @@ on:
       - ci/**
       - requirements-build.txt
       - requirements-utest.txt
+      - utils/build.config
 
 permissions: {}
 
diff --git a/debian/changelog b/debian/changelog
index 6891cea4737f..9790dcbee3e6 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,19 @@
+daos (2.6.1-4) unstable; urgency=medium
+  [ Tomasz Gromadzki ]
+  * Add support of the PMDK package 2.1.0 with NDCTL enabled.
+    * Increase the default ULT stack size to 20KiB if the engine uses
+      the DCPM storage class.
+    * Prevent using the RAM storage class (simulated PMem) when
+      the shutdown state (SDS) is active.
+      * Automatically disable SDS for the RAM storage class on engine startup.
+      * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0'
+        environment variable to deactivate SDS for the DAOS tools
+        (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM.
+        Otherwise, a user is supposed to be stopped by an error
+        like: "Unsafe shutdown count is not supported for this source".
+
+ -- Tomasz Gromadzki <tomasz.gromadzki@intel.com>  Wed, 02 Oct 2024 12:00:00 +0200
+
 daos (2.6.1-3) unstable; urgency=medium
   [ Phillip Henderson ]
   * Third release candidate for 2.6.1
diff --git a/debian/control b/debian/control
index bdc377e6d94a..32b97fd67d18 100644
--- a/debian/control
+++ b/debian/control
@@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10),
                python3-distro,
                libabt-dev,
                libucx-dev,
-               libpmemobj-dev (>= 2.0.0),
+               libpmemobj-dev (>= 2.1.0),
                libfuse3-dev,
                libprotobuf-c-dev,
                libjson-c-dev,
@@ -118,7 +118,9 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml,
          daos-client (= ${binary:Version}),
          daos-admin (= ${binary:Version}),
          golang-go (>=1.18),
-         libcapstone-dev
+         libcapstone-dev,
+         libndctl-dev,
+         libdaxctl-dev
 Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source
  software-defined object store designed from the ground up for
  massively distributed Non Volatile Memory (NVM). DAOS takes advantage
diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py
index ed50b80461b5..1ae7bb7a2aa3 100644
--- a/site_scons/components/__init__.py
+++ b/site_scons/components/__init__.py
@@ -266,7 +266,6 @@ def define_components(reqs):
                 retriever=GitRepoRetriever(),
                 commands=[['make',
                            'all',
-                           'NDCTL_ENABLE=n',
                            'BUILD_EXAMPLES=n',
                            'BUILD_BENCHMARKS=n',
                            'DOC=n',
diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go
index 3e6ab0b64d79..38595fd8091a 100644
--- a/src/control/server/engine/config.go
+++ b/src/control/server/engine/config.go
@@ -9,6 +9,7 @@ package engine
 import (
 	"fmt"
 	"os"
+	"strconv"
 	"strings"
 
 	"github.com/pkg/errors"
@@ -28,6 +29,8 @@ const (
 	envLogMasks      = "D_LOG_MASK"
 	envLogDbgStreams = "DD_MASK"
 	envLogSubsystems = "DD_SUBSYS"
+
+	minABTThreadStackSizeDCPM = 20480
 )
 
 // FabricConfig encapsulates networking fabric configuration.
@@ -342,7 +345,80 @@ func (c *Config) Validate() error {
 	if err := ValidateLogSubsystems(subsystems); err != nil {
 		return errors.Wrap(err, "validate engine log subsystems")
 	}
+	return nil
+}
+
+// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class.
+func (c *Config) UpdatePMDKEnvarsStackSizeDCPM() error {
+	stackSizeStr, err := c.GetEnvVar("ABT_THREAD_STACKSIZE")
+	if err != nil {
+		c.EnvVars = append(c.EnvVars, fmt.Sprintf("ABT_THREAD_STACKSIZE=%d",
+			minABTThreadStackSizeDCPM))
+		return nil
+	}
+	// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class.
+	stackSizeValue, err := strconv.Atoi(stackSizeStr)
+	if err != nil {
+		return errors.Errorf("env_var ABT_THREAD_STACKSIZE has invalid value: %s",
+			stackSizeStr)
+	}
+	if stackSizeValue < minABTThreadStackSizeDCPM {
+		return errors.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+
+			"for DCPM storage class, found %d", minABTThreadStackSizeDCPM,
+			stackSizeValue)
+	}
+	return nil
+}
+
+// Ensure proper configuration of shutdown (SDS) state
+func (c *Config) UpdatePMDKEnvarsPMemobjConf(isDCPM bool) error {
+	pmemobjConfStr, pmemobjConfErr := c.GetEnvVar("PMEMOBJ_CONF")
+	//also work for empty string
+	hasSdsAtCreate := strings.Contains(pmemobjConfStr, "sds.at_create")
+	if isDCPM {
+		if !hasSdsAtCreate {
+			return nil
+		}
+		// Confirm default handling of shutdown state (SDS) for DCPM storage class.
+		return errors.New("env_var PMEMOBJ_CONF should NOT contain 'sds.at_create=?' " +
+			"for DCPM storage class, found '" + pmemobjConfStr + "'")
+	}
+
+	// Disable shutdown state (SDS) (part of RAS) for RAM-based simulated SCM.
+	if pmemobjConfErr != nil {
+		c.EnvVars = append(c.EnvVars, "PMEMOBJ_CONF=sds.at_create=0")
+		return nil
+	}
+	if !hasSdsAtCreate {
+		envVars, _ := common.DeleteKeyValue(c.EnvVars, "PMEMOBJ_CONF")
+		c.EnvVars = append(envVars, "PMEMOBJ_CONF="+pmemobjConfStr+
+			";sds.at_create=0")
+		return nil
+	}
+	if strings.Contains(pmemobjConfStr, "sds.at_create=1") {
+		return errors.New("env_var PMEMOBJ_CONF should contain 'sds.at_create=0' " +
+			"for non-DCPM storage class, found '" + pmemobjConfStr + "'")
+	}
+	return nil
+}
+
+// Ensure proper environment variables for PMDK w/ NDCTL enabled based on
+// the actual configuration of the storage class.
+func (c *Config) UpdatePMDKEnvars() error {
+
+	if len(c.Storage.Tiers) == 0 {
+		return errors.New("Invalid config - no tier 0 defined")
+	}
+
+	isDCPM := c.Storage.Tiers[0].Class == storage.ClassDcpm
 
+	if err := c.UpdatePMDKEnvarsPMemobjConf(isDCPM); err != nil {
+		return err
+	}
+
+	if isDCPM {
+		return c.UpdatePMDKEnvarsStackSizeDCPM()
+	}
 	return nil
 }
 
@@ -690,3 +766,13 @@ func (c *Config) WithStorageIndex(i uint32) *Config {
 	c.Storage.EngineIdx = uint(i)
 	return c
 }
+
+// WithEnvVarAbtThreadStackSize sets environment variable ABT_THREAD_STACKSIZE.
+func (c *Config) WithEnvVarAbtThreadStackSize(stack_size uint16) *Config {
+	return c.WithEnvVars(fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", stack_size))
+}
+
+// WithEnvVarPMemObjSdsAtCreate sets PMEMOBJ_CONF env. var. to sds.at_create=0/1 value
+func (c *Config) WithEnvVarPMemObjSdsAtCreate(value uint8) *Config {
+	return c.WithEnvVars(fmt.Sprintf("PMEMOBJ_CONF=sds.at_create=%d", value))
+}
diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go
index 463e86567dcd..d9ca2bfebbd2 100644
--- a/src/control/server/engine/config_test.go
+++ b/src/control/server/engine/config_test.go
@@ -1104,3 +1104,210 @@ func TestFabricConfig_Update(t *testing.T) {
 		})
 	}
 }
+
+func TestConfig_UpdatePMDKEnvarsStackSizeDCPM(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg                   *Config
+		expErr                error
+		expABTthreadStackSize int
+	}{
+		"empty config should not fail": {
+			cfg:                   MockConfig(),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"valid config for DCPM should not fail": {
+			cfg:                   validConfig().WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"config for DCPM without thread size should not fail": {
+			cfg:                   validConfig(),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"config for DCPM with stack size big enough should not fail": {
+			cfg: validConfig().
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM + 1),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM + 1,
+		},
+		"config for DCPM with stack size too small should fail": {
+			cfg: validConfig().
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expErr: errors.New(fmt.Sprintf("env_var ABT_THREAD_STACKSIZE "+
+				"should be >= %d for DCPM storage class, found %d",
+				minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1)),
+		},
+		"config for DCPM with invalid ABT_THREAD_STACKSIZE value should fail": {
+			cfg:    validConfig().WithEnvVars("ABT_THREAD_STACKSIZE=foo_bar"),
+			expErr: errors.New("env_var ABT_THREAD_STACKSIZE has invalid value: foo_bar"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			err := tc.cfg.UpdatePMDKEnvarsStackSizeDCPM()
+			test.CmpErr(t, tc.expErr, err)
+			if err == nil {
+				stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
+				test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE")
+				stackSizeVal, err := strconv.Atoi(stackSizeStr)
+				test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE")
+				test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
+					"Invalid ABT_THREAD_STACKSIZE value")
+			}
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg    *Config
+		expErr error
+	}{
+		"empty config should not fail": {
+			cfg: MockConfig(),
+		},
+		"valid config for DCPM should not fail": {
+			cfg: validConfig(),
+		},
+		"config for DCPM with forced sds.at_create (1) should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
+				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=1'"),
+		},
+		"config for DCPM with forced sds.at_create (0) should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0),
+			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
+				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=0'"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(true))
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvarsPMemobjConfNRam(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg             *Config
+		expErr          error
+		expPMEMOBJ_CONF string
+	}{
+		"empty config should not fail": {
+			cfg:             validConfig(),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"config for ram without PMEMOBJ_CONF should not fail": {
+			cfg:             MockConfig(),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"valid config for should not fail": {
+			cfg:             validConfig().WithEnvVarPMemObjSdsAtCreate(0),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"config for ram w/ PMEMOBJ_CONF w/o sds.at_create should should be updated": {
+			cfg:             validConfig().WithEnvVars("PMEMOBJ_CONF=foo_bar"),
+			expPMEMOBJ_CONF: "foo_bar;sds.at_create=0",
+		},
+		"config for ram with sds.at_create set to 1 should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class" +
+				", found 'sds.at_create=1'"),
+		},
+		"config for ram w/ PMEMOBJ_CONF w/ sds.at_create=1 should fail": {
+			cfg: validConfig().
+				WithEnvVars("PMEMOBJ_CONF=sds.at_create=1;foo-bar"),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class" +
+				", found 'sds.at_create=1;foo-bar'"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(false))
+			if len(tc.expPMEMOBJ_CONF) > 0 {
+				sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
+				test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
+				test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create,
+					"Invalid PMEMOBJ_CONF")
+			}
+
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvars(t *testing.T) {
+	validConfig := func(storageclas string) *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass(storageclas))
+	}
+	for name, tc := range map[string]struct {
+		cfg                   *Config
+		expErr                error
+		expPMEMOBJ_CONF       string
+		expABTthreadStackSize int
+	}{
+		"empty config should fail": {
+			cfg:                   MockConfig(),
+			expErr:                errors.New("Invalid config - no tier 0 defined"),
+			expABTthreadStackSize: -1,
+		},
+		"valid config for RAM should not fail": {
+			cfg: validConfig("ram").
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expPMEMOBJ_CONF:       "sds.at_create=0",
+			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
+		},
+		"invalid config for RAM should fail": {
+			cfg: validConfig("ram").WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class, " +
+				"found 'sds.at_create=1'"),
+			expABTthreadStackSize: -1,
+		},
+		"valid config for DCPM should not fail": {
+			cfg:                   validConfig("dcpm"),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"invalid config for DCPM should not fail": {
+			cfg: validConfig("dcpm").
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expErr: errors.New("env_var ABT_THREAD_STACKSIZE should be >= 20480 " +
+				"for DCPM storage class, found 20479"),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			errTc := tc.cfg.UpdatePMDKEnvars()
+			test.CmpErr(t, tc.expErr, errTc)
+			if len(tc.expPMEMOBJ_CONF) > 0 {
+				sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
+				test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
+				test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create,
+					"Invalid PMEMOBJ_CONF")
+			}
+			if tc.expABTthreadStackSize >= 0 {
+				stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
+				test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE")
+				stackSizeVal, err := strconv.Atoi(stackSizeStr)
+				test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE")
+				test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
+					"Invalid ABT_THREAD_STACKSIZE value")
+			}
+		})
+	}
+}
diff --git a/src/control/server/server.go b/src/control/server/server.go
index 4ac8ce5e04be..c9232dd1725c 100644
--- a/src/control/server/server.go
+++ b/src/control/server/server.go
@@ -103,6 +103,12 @@ func processConfig(log logging.Logger, cfg *config.Server, fis *hardware.FabricI
 		return err
 	}
 
+	for _, ec := range cfg.Engines {
+		if err := ec.UpdatePMDKEnvars(); err != nil {
+			return err
+		}
+	}
+
 	return nil
 }
 
diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
index b03215d08aac..e1d6ee79bce8 100644
--- a/src/mgmt/srv_target.c
+++ b/src/mgmt/srv_target.c
@@ -1180,7 +1180,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 		/* A zero size accommodates the existing file */
 		vpa.vpa_scm_size = 0;
 		vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr;
-		rc = dss_thread_collective(tgt_vos_create_one, &vpa, 0);
+		rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK);
 		if (rc) {
 			D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n",
 				DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
diff --git a/utils/build.config b/utils/build.config
index 080ebae12830..52749b3f1ce5 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -4,7 +4,7 @@ component=daos
 [commit_versions]
 argobots=v1.1
 fuse=fuse-3.16.2
-pmdk=2.0.0
+pmdk=2.1.0
 isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
@@ -30,3 +30,4 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497
 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff
 fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
 mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
+pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc
index 09022a74e59b..bbf0f9e2b925 100644
--- a/utils/rpms/daos.rpmlintrc
+++ b/utils/rpms/daos.rpmlintrc
@@ -47,10 +47,10 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a"
 addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so")
 
 # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected.
-addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel")
 
 # a functional test builds daos from source, so it needs the various *-devel packages for daos' build dependencies.
-addFilter("daos-client-tests.x86_64: E: devel-dependency capstone-devel")
-addFilter("daos-client-tests.x86_64: E: explicit-lib-dependency libcapstone-devel")
-addFilter("daos-client-tests.x86_64: E: devel-dependency libcapstone-devel")
-addFilter("daos-client-tests.x86_64: E: devel-dependency fuse3-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency capstone-devel")
+addFilter("daos-client-tests\.x86_64: E: explicit-lib-dependency lib(capstone|ndctl)-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency libcapstone-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency fuse3-devel")
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index 14fa74776a22..29576bc71a1b 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -15,7 +15,7 @@
 
 Name:          daos
 Version:       2.6.1
-Release:       3%{?relval}%{?dist}
+Release:       4%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -49,7 +49,7 @@ BuildRequires: libabt-devel >= 1.0rc1
 BuildRequires: libjson-c-devel
 BuildRequires: boost-devel
 %endif
-BuildRequires: libpmemobj-devel >= 2.0.0
+BuildRequires: libpmemobj-devel >= 2.1.0
 %if (0%{?rhel} >= 8)
 BuildRequires: fuse3-devel >= 3
 %else
@@ -147,11 +147,11 @@ Requires: ndctl
 # needed to set PMem configuration goals in BIOS through control-plane
 %if (0%{?suse_version} >= 1500)
 Requires: ipmctl >= 03.00.00.0423
-Requires: libpmemobj1 >= 2.0.0-1.suse1500
+Requires: libpmemobj1 >= 2.1.0-1.suse1500
 Requires: libfabric1 >= %{libfabric_version}
 %else
 Requires: ipmctl >= 03.00.00.0468
-Requires: libpmemobj >= 2.0.0-1%{?dist}
+Requires: libpmemobj >= 2.1.0-1%{?dist}
 %endif
 Requires: libfabric >= %{libfabric_version}
 Requires: mercury >= %{mercury_version}
@@ -232,6 +232,13 @@ Requires: fuse3-devel >= 3
 Requires: fuse3-devel >= 3.4.2
 %endif
 Requires: pciutils-devel
+%if (0%{?suse_version} > 0)
+Requires: libndctl-devel
+%endif
+%if (0%{?rhel} >= 8)
+Requires: ndctl-devel
+Requires: daxctl-devel
+%endif
 
 %description client-tests
 This is the package needed to run the DAOS test suite (client tests)
@@ -591,6 +598,20 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 # No files in a shim package
 
 %changelog
+
+* Wed Oct 02 2024 Tomasz Gromadzki <tomasz.gromadzki@intel.com> 2.6.1-4
+- Add support of the PMDK package 2.1.0 with NDCTL enabled.
+  * Increase the default ULT stack size to 20KiB if the engine uses
+    the DCPM storage class.
+  * Prevent using the RAM storage class (simulated PMem) when
+    the shutdown state (SDS) is active.
+    * Automatically disable SDS for the RAM storage class on engine startup.
+    * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0'
+      environment variable to deactivate SDS for the DAOS tools
+      (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM.
+      Otherwise, a user is supposed to be stopped by an error
+      like: "Unsafe shutdown count is not supported for this source".
+
 * Tue Oct 01 2024 Phillip Henderson <phillip.henderson@intel.com> 2.6.1-3
 - Third release candidate for 2.6.1
 
diff --git a/utils/run_utest.py b/utils/run_utest.py
index c2f4ffd002dd..1835f230e362 100755
--- a/utils/run_utest.py
+++ b/utils/run_utest.py
@@ -440,6 +440,7 @@ def run(self, base, memcheck, sudo):
             cmd = new_cmd
         self.last = cmd
 
+        self.env.update({"PMEMOBJ_CONF": "sds.at_create=0"})
         if self.suite.gha:
             retval = run_cmd(cmd, env=self.env)
         else:
diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh
index 81a044bfddb0..472f88c99255 100755
--- a/utils/scripts/install-el8.sh
+++ b/utils/scripts/install-el8.sh
@@ -20,6 +20,7 @@ dnf --nodocs install \
     clang-tools-extra \
     cmake \
     CUnit-devel \
+    daxctl-devel \
     diffutils \
     e2fsprogs \
     file \
@@ -48,6 +49,7 @@ dnf --nodocs install \
     lz4-devel \
     make \
     ndctl \
+    ndctl-devel \
     numactl \
     numactl-devel \
     openmpi-devel \
diff --git a/utils/scripts/install-el9.sh b/utils/scripts/install-el9.sh
index 9ddd8c257d60..092de8eba0f0 100755
--- a/utils/scripts/install-el9.sh
+++ b/utils/scripts/install-el9.sh
@@ -18,6 +18,7 @@ dnf --nodocs install \
     clang-tools-extra \
     cmake \
     CUnit-devel \
+    daxctl-devel \
     diffutils \
     e2fsprogs \
     file \
@@ -47,6 +48,7 @@ dnf --nodocs install \
     lz4-devel \
     make \
     ndctl \
+    ndctl-devel \
     numactl \
     numactl-devel \
     openmpi-devel \
diff --git a/utils/scripts/install-leap15.sh b/utils/scripts/install-leap15.sh
index fc9826ce508d..0eb8ef44fee4 100755
--- a/utils/scripts/install-leap15.sh
+++ b/utils/scripts/install-leap15.sh
@@ -38,6 +38,7 @@ dnf --nodocs install \
     libjson-c-devel \
     libltdl7 \
     liblz4-devel \
+    libndctl-devel \
     libnuma-devel \
     libopenssl-devel \
     libprotobuf-c-devel \
diff --git a/utils/scripts/install-ubuntu.sh b/utils/scripts/install-ubuntu.sh
index ec21c92f474c..a36641d5f101 100755
--- a/utils/scripts/install-ubuntu.sh
+++ b/utils/scripts/install-ubuntu.sh
@@ -29,11 +29,13 @@ apt-get install \
     libcapstone-dev \
     libcmocka-dev \
     libcunit1-dev \
+    libdaxctl-dev \
     libfuse3-dev \
     libhwloc-dev \
     libibverbs-dev \
     libjson-c-dev \
     liblz4-dev \
+    libndctl-dev \
     libnuma-dev \
     libopenmpi-dev \
     libpci-dev \

From 60d4b5dc682463eb61f829b415cc31ea0b993ebc Mon Sep 17 00:00:00 2001
From: Li Wei <wei.g.li@intel.com>
Date: Fri, 18 Oct 2024 17:39:03 +0900
Subject: [PATCH 05/26] DAOS-16653 pool: Batch crt events (#15230) (#15302)

* DAOS-16653 pool: Batch crt events

When multiple engines become unavailable around the same time, if a pool
cannot tolerate the unavailability of those engines, it is sometimes
desired that the pool would not exclude any of the engines. Hence, this
patch introduces a CaRT event delay, tunable via the server-side
environment variable, CRT_EVENT_DELAY, so that the events signaling the
unavailability of those engines will be handled in hopefully one batch,
giving pool_svc_update_map_internal a chance to reject the pool map
update based on the RF check.

When the RF check rejects a pool map change, we should revisit the
corresponding events later, rather than simply throwing them away. This
patch improves this case by returning the events back to the event
queue, and pause the queue handling until next new event or pool map
update.

  - Introduce event sets: pool_svc_event_set. Now the event queue can be
    simplified to just one event set.

  - Add the ability to pause and resume the event handling: pse_paused.

  - Track the time when the latest event was queued: pse_time.

Signed-off-by: Li Wei <wei.g.li@intel.com>
---
 docs/admin/env_variables.md                 |   1 +
 src/pool/srv_pool.c                         | 454 +++++++++++++++-----
 src/tests/ftest/util/server_utils_params.py |   1 +
 3 files changed, 346 insertions(+), 110 deletions(-)

diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
index 060c3790d579..31e0f3e87672 100644
--- a/docs/admin/env_variables.md
+++ b/docs/admin/env_variables.md
@@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_MD\_CAP         |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
 |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
 |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
+|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
 |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
 |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
 |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 667e4bc6ed69..2d80088d4b6b 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version)
 
 /* Pool service crt event */
 struct pool_svc_event {
-	d_list_t		psv_link;
 	d_rank_t		psv_rank;
 	uint64_t		psv_incarnation;
 	enum crt_event_source	psv_src;
@@ -72,15 +71,40 @@ struct pool_svc_event {
 #define DF_PS_EVENT	"rank=%u inc="DF_U64" src=%d type=%d"
 #define DP_PS_EVENT(e)	e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type
 
-#define RECHOOSE_SLEEP_MS 250
+/*
+ * Pool service crt event set
+ *
+ * This stores an unordered array of pool_svc_event objects. For all different
+ * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank.
+ *
+ * An event set facilitates the merging of a sequence of events. For instance,
+ * sequence (in the format <rank, type>)
+ *   <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D>
+ * will merge into set
+ *   <3, D>, <5, A>, <1, D>
+ * (that is, during the merge, an event overrides a previuos event of the same
+ * rank in the set).
+ */
+struct pool_svc_event_set {
+	struct pool_svc_event *pss_buf;
+	uint32_t               pss_len;
+	uint32_t               pss_cap;
+};
+
+#define DF_PS_EVENT_SET    "len=%u"
+#define DP_PS_EVENT_SET(s) s->pss_len
 
 /* Pool service crt-event-handling state */
 struct pool_svc_events {
-	ABT_mutex		pse_mutex;
-	ABT_cond		pse_cv;
-	d_list_t		pse_queue;
-	ABT_thread		pse_handler;
-	bool			pse_stop;
+	ABT_mutex                  pse_mutex;
+	ABT_cond                   pse_cv;
+	struct pool_svc_event_set *pse_pending;
+	uint64_t                   pse_timeout; /* s */
+	uint64_t                   pse_time;    /* s */
+	struct sched_request      *pse_timer;
+	ABT_thread                 pse_handler;
+	bool                       pse_stop;
+	bool                       pse_paused;
 };
 
 /* Pool service schedule state */
@@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path)
 	return 0;
 }
 
+static unsigned int
+get_crt_event_delay(void)
+{
+	unsigned int t = 10 /* s */;
+
+	d_getenv_uint("CRT_EVENT_DELAY", &t);
+	return t;
+}
+
 static int
 pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 {
@@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 	d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t));
 
 	uuid_copy(svc->ps_uuid, id->iov_buf);
-	D_INIT_LIST_HEAD(&svc->ps_events.pse_queue);
+	svc->ps_events.pse_timeout = get_crt_event_delay();
 	svc->ps_events.pse_handler = ABT_THREAD_NULL;
 	svc->ps_svc_rf = -1;
 	svc->ps_force_notify = false;
@@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void)
 	pool_disable_exclude = false;
 }
 
+static int
+alloc_event_set(struct pool_svc_event_set **event_set)
+{
+	D_ALLOC_PTR(*event_set);
+	if (*event_set == NULL)
+		return -DER_NOMEM;
+	return 0;
+}
+
+static void
+free_event_set(struct pool_svc_event_set **event_set)
+{
+	D_FREE((*event_set)->pss_buf);
+	D_FREE(*event_set);
+}
+
+static int
+add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation,
+		 enum crt_event_source src, enum crt_event_type type)
+{
+	int i;
+
+	/* Find rank in event_set. */
+	for (i = 0; i < event_set->pss_len; i++)
+		if (event_set->pss_buf[i].psv_rank == rank)
+			break;
+
+	/* If not found, prepare to add a new event. */
+	if (i == event_set->pss_len) {
+		if (event_set->pss_len == event_set->pss_cap) {
+			uint32_t               cap;
+			struct pool_svc_event *buf;
+
+			if (event_set->pss_cap == 0)
+				cap = 1;
+			else
+				cap = 2 * event_set->pss_cap;
+			D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap);
+			if (buf == NULL)
+				return -DER_NOMEM;
+			event_set->pss_buf = buf;
+			event_set->pss_cap = cap;
+		}
+		event_set->pss_len++;
+	}
+
+	event_set->pss_buf[i].psv_rank        = rank;
+	event_set->pss_buf[i].psv_incarnation = incarnation;
+	event_set->pss_buf[i].psv_src         = src;
+	event_set->pss_buf[i].psv_type        = type;
+	return 0;
+}
+
+/* Merge next into prev. */
+static int
+merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next)
+{
+	int i;
+
+	for (i = 0; i < next->pss_len; i++) {
+		struct pool_svc_event *event = &next->pss_buf[i];
+		int                    rc;
+
+		rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src,
+				      event->psv_type);
+		if (rc != 0)
+			return rc;
+	}
+	return 0;
+}
+
 static int
 queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src,
 	    enum crt_event_type type)
 {
 	struct pool_svc_events *events = &svc->ps_events;
-	struct pool_svc_event  *event;
+	int                     rc;
+	bool                    allocated = false;
 
-	D_ALLOC_PTR(event);
-	if (event == NULL)
-		return -DER_NOMEM;
+	D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank,
+		incarnation, src, type);
 
-	event->psv_rank = rank;
-	event->psv_incarnation = incarnation;
-	event->psv_src = src;
-	event->psv_type = type;
+	ABT_mutex_lock(events->pse_mutex);
 
-	D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid),
-		DP_PS_EVENT(event));
+	if (events->pse_pending == NULL) {
+		rc = alloc_event_set(&events->pse_pending);
+		if (rc != 0)
+			goto out;
+		allocated = true;
+	}
+
+	rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type);
+	if (rc != 0)
+		goto out;
+
+	events->pse_time = daos_gettime_coarse();
+
+	if (events->pse_paused) {
+		D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid));
+		events->pse_paused = false;
+	}
 
-	ABT_mutex_lock(events->pse_mutex);
-	d_list_add_tail(&event->psv_link, &events->pse_queue);
 	ABT_cond_broadcast(events->pse_cv);
+
+out:
+	if (rc != 0 && allocated)
+		free_event_set(&events->pse_pending);
 	ABT_mutex_unlock(events->pse_mutex);
-	return 0;
+	return rc;
 }
 
 static void
-discard_events(d_list_t *queue)
+resume_event_handling(struct pool_svc *svc)
 {
-	struct pool_svc_event  *event;
-	struct pool_svc_event  *tmp;
+	struct pool_svc_events *events = &svc->ps_events;
 
-	d_list_for_each_entry_safe(event, tmp, queue, psv_link) {
-		D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event));
-		d_list_del_init(&event->psv_link);
-		D_FREE(event);
+	ABT_mutex_lock(events->pse_mutex);
+	if (events->pse_paused) {
+		D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid));
+		events->pse_paused = false;
+		ABT_cond_broadcast(events->pse_cv);
 	}
+	ABT_mutex_unlock(events->pse_mutex);
 }
 
-static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank);
+static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set);
 
-static void
-handle_event(struct pool_svc *svc, struct pool_svc_event *event)
+static int
+handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set)
 {
+	int i;
 	int rc;
 
-	if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) ||
-	    (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) {
-		D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event),
-			pool_disable_exclude);
-		goto out;
+	D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid),
+	       DP_PS_EVENT_SET(event_set));
+
+	if (!pool_disable_exclude) {
+		rc = pool_svc_exclude_ranks(svc, event_set);
+		if (rc != 0) {
+			DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid));
+			return rc;
+		}
 	}
 
-	if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD &&
-	    event->psv_type == CRT_EVT_DEAD) {
-		D_DEBUG(DB_MD, "ignore exclusion of self\n");
-		goto out;
+	/*
+	 * Check if the alive ranks are up in the pool map. If in the future we
+	 * add automatic reintegration below, for instance, we may need
+	 * to not only take svc->ps_lock, but also employ an RDB TX by
+	 * the book.
+	 */
+	ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
+	for (i = 0; i < event_set->pss_len; i++) {
+		struct pool_svc_event *event = &event_set->pss_buf[i];
+
+		if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE)
+			continue;
+		if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) {
+			/*
+			 * The rank is up in the pool map. Request a pool map
+			 * distribution just in case the rank has recently
+			 * restarted and does not have a copy of the pool map.
+			 */
+			ds_rsvc_request_map_dist(&svc->ps_rsvc);
+			D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n",
+				DP_UUID(svc->ps_uuid), event->psv_rank);
+			break;
+		}
 	}
+	ABT_rwlock_unlock(svc->ps_pool->sp_lock);
 
-	D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid),
-		DP_PS_EVENT(event));
+	return 0;
+}
 
-	if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) {
-		/*
-		 * Check if the rank is up in the pool map. If in the future we
-		 * add automatic reintegration below, for instance, we may need
-		 * to not only take svc->ps_lock, but also employ an RDB TX by
-		 * the book.
-		 */
-		ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
-		rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank);
-		ABT_rwlock_unlock(svc->ps_pool->sp_lock);
-		if (!rc)
-			goto out;
+struct event_timer_arg {
+	struct pool_svc_events *eta_events;
+	uint64_t                eta_deadline;
+};
 
-		/*
-		 * The rank is up in the pool map. Request a pool map
-		 * distribution just in case the rank has recently restarted
-		 * and does not have a copy of the pool map.
-		 */
-		ds_rsvc_request_map_dist(&svc->ps_rsvc);
-		D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n",
-			DP_UUID(svc->ps_uuid), event->psv_rank);
-	} else if (event->psv_type == CRT_EVT_DEAD) {
-		rc = pool_svc_exclude_rank(svc, event->psv_rank);
-		if (rc != 0)
-			D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n",
-				DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc));
-	}
+static void
+event_timer(void *varg)
+{
+	struct event_timer_arg *arg = varg;
+	struct pool_svc_events *events = arg->eta_events;
+	int64_t                 time_left = arg->eta_deadline - daos_gettime_coarse();
 
-out:
-	return;
+	if (time_left > 0)
+		sched_req_sleep(events->pse_timer, time_left * 1000);
+	ABT_cond_broadcast(events->pse_cv);
+}
+
+static int
+start_event_timer(struct event_timer_arg *arg)
+{
+	struct pool_svc_events *events = arg->eta_events;
+	uuid_t                  uuid;
+	struct sched_req_attr   attr;
+
+	D_ASSERT(events->pse_timer == NULL);
+	uuid_clear(uuid);
+	sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid);
+	events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0);
+	if (events->pse_timer == NULL)
+		return -DER_NOMEM;
+	return 0;
+}
+
+static void
+stop_event_timer(struct event_timer_arg *arg)
+{
+	struct pool_svc_events *events = arg->eta_events;
+
+	D_ASSERT(events->pse_timer != NULL);
+	sched_req_wait(events->pse_timer, true /* abort */);
+	sched_req_put(events->pse_timer);
+	events->pse_timer = NULL;
 }
 
 static void
@@ -1403,31 +1559,83 @@ events_handler(void *arg)
 	D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid));
 
 	for (;;) {
-		struct pool_svc_event  *event = NULL;
-		bool			stop;
+		struct pool_svc_event_set *event_set = NULL;
+		bool                       stop;
+		int                        rc;
 
 		ABT_mutex_lock(events->pse_mutex);
 		for (;;) {
+			struct event_timer_arg timer_arg;
+			int64_t                time_left;
+
 			stop = events->pse_stop;
 			if (stop) {
-				discard_events(&events->pse_queue);
+				events->pse_paused = false;
+				if (events->pse_pending != NULL)
+					free_event_set(&events->pse_pending);
 				break;
 			}
-			if (!d_list_empty(&events->pse_queue)) {
-				event = d_list_entry(events->pse_queue.next, struct pool_svc_event,
-						     psv_link);
-				d_list_del_init(&event->psv_link);
+
+			timer_arg.eta_events   = events;
+			timer_arg.eta_deadline = events->pse_time + events->pse_timeout;
+
+			time_left = timer_arg.eta_deadline - daos_gettime_coarse();
+			if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) {
+				event_set = events->pse_pending;
+				events->pse_pending = NULL;
 				break;
 			}
+
+			/* A simple timed cond_wait without polling. */
+			if (time_left > 0) {
+				rc = start_event_timer(&timer_arg);
+				if (rc != 0) {
+					/* No delay then. */
+					DL_ERROR(rc, DF_UUID ": failed to start event timer",
+						 DP_UUID(svc->ps_uuid));
+					events->pse_time = 0;
+					continue;
+				}
+			}
 			sched_cond_wait(events->pse_cv, events->pse_mutex);
+			if (time_left > 0)
+				stop_event_timer(&timer_arg);
 		}
 		ABT_mutex_unlock(events->pse_mutex);
 		if (stop)
 			break;
 
-		handle_event(svc, event);
+		rc = handle_event(svc, event_set);
+		if (rc != 0) {
+			/* Put event_set back to events->pse_pending. */
+			D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid));
+			ABT_mutex_lock(events->pse_mutex);
+			if (events->pse_pending == NULL) {
+				/*
+				 * No pending events; pause the handling until
+				 * next event or pool map change.
+				 */
+				D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n",
+					DP_UUID(svc->ps_uuid));
+				events->pse_paused = true;
+			} else {
+				/*
+				 * There are pending events; do not pause the
+				 * handling.
+				 */
+				rc = merge_event_sets(event_set, events->pse_pending);
+				if (rc != 0)
+					DL_ERROR(rc, DF_UUID ": failed to merge events",
+						 DP_UUID(svc->ps_uuid));
+				free_event_set(&events->pse_pending);
+			}
+			events->pse_pending = event_set;
+			event_set = NULL;
+			ABT_mutex_unlock(events->pse_mutex);
+		}
 
-		D_FREE(event);
+		if (event_set != NULL)
+			free_event_set(&event_set);
 		ABT_thread_yield();
 	}
 
@@ -1437,7 +1645,7 @@ events_handler(void *arg)
 static bool
 events_pending(struct pool_svc *svc)
 {
-	return !d_list_empty(&svc->ps_events.pse_queue);
+	return svc->ps_events.pse_pending != NULL;
 }
 
 static void
@@ -1461,9 +1669,11 @@ init_events(struct pool_svc *svc)
 	struct pool_svc_events *events = &svc->ps_events;
 	int			rc;
 
-	D_ASSERT(d_list_empty(&events->pse_queue));
+	D_ASSERT(events->pse_pending == NULL);
+	D_ASSERT(events->pse_timer == NULL);
 	D_ASSERT(events->pse_handler == ABT_THREAD_NULL);
-	D_ASSERT(events->pse_stop == false);
+	D_ASSERT(!events->pse_stop);
+	D_ASSERT(!events->pse_paused);
 
 	if (!ds_pool_skip_for_check(svc->ps_pool)) {
 		rc = crt_register_event_cb(ds_pool_crt_event_cb, svc);
@@ -1498,7 +1708,8 @@ init_events(struct pool_svc *svc)
 err_cb:
 	if (!ds_pool_skip_for_check(svc->ps_pool))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
-	discard_events(&events->pse_queue);
+	if (events->pse_pending != NULL)
+		free_event_set(&events->pse_pending);
 err:
 	return rc;
 }
@@ -1507,7 +1718,6 @@ static void
 fini_events(struct pool_svc *svc)
 {
 	struct pool_svc_events *events = &svc->ps_events;
-	int			rc;
 
 	D_ASSERT(events->pse_handler != ABT_THREAD_NULL);
 
@@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc)
 	ABT_cond_broadcast(events->pse_cv);
 	ABT_mutex_unlock(events->pse_mutex);
 
-	rc = ABT_thread_join(events->pse_handler);
-	D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc));
 	ABT_thread_free(&events->pse_handler);
 	events->pse_handler = ABT_THREAD_NULL;
 	events->pse_stop = false;
@@ -4332,7 +4540,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks,
 		list_cont_bulk_destroy(bulk);
 		D_FREE(resp_cont);
 		crt_req_decref(rpc);
-		dss_sleep(RECHOOSE_SLEEP_MS);
+		dss_sleep(250);
 		D_GOTO(rechoose, rc);
 	}
 
@@ -6501,7 +6709,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma
 }
 
 static int
-pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank)
+pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map)
 {
 	crt_group_t		*primary_grp;
 	struct pool_domain	*doms;
@@ -6517,13 +6725,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank)
 
 	D_CRIT("!!! Please try to recover these engines in top priority -\n");
 	D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n");
-	D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n",
-	       DP_UUID(svc->ps_uuid), rank);
 	for (i = 0; i < doms_cnt; i++) {
 		struct swim_member_state state;
 
-		if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) ||
-		    (doms[i].do_comp.co_rank == rank))
+		if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN))
 			continue;
 
 		rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state);
@@ -6701,8 +6906,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 	 * with CRIT log message to ask administrator to bring back the engine.
 	 */
 	if (src == MUS_SWIM && opc == MAP_EXCLUDE) {
-		d_rank_t	rank;
-		int		failed_cnt;
+		int failed_cnt;
 
 		rc = pool_map_update_failed_cnt(map);
 		if (rc != 0) {
@@ -6711,15 +6915,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 			goto out_map;
 		}
 
-		D_ASSERT(tgt_addrs->pta_number == 1);
-		rank = tgt_addrs->pta_addrs->pta_rank;
 		failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE);
-		D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n",
-		       DP_UUID(svc->ps_uuid), rank, failed_cnt);
+		D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n",
+		       DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt);
 		if (failed_cnt > pw_rf) {
-			D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n",
-			       DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt);
-			rc = pool_map_crit_prompt(svc, map, rank);
+			D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n",
+			       DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt);
+			ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
+			rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map);
+			ABT_rwlock_unlock(svc->ps_pool->sp_lock);
+			if (rc != 0)
+				DL_ERROR(rc, DF_UUID ": failed to log prompt",
+					 DP_UUID(svc->ps_uuid));
+			rc = -DER_RF;
 			goto out_map;
 		}
 	}
@@ -6768,6 +6976,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 
 	ds_rsvc_request_map_dist(&svc->ps_rsvc);
 
+	/* See events_handler. */
+	resume_event_handling(svc);
+
 	rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */);
 	if (rc != 0) {
 		DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration",
@@ -7167,28 +7378,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc)
 }
 
 static int
-pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank)
+pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set)
 {
 	struct pool_target_addr_list	list;
 	struct pool_target_addr_list	inval_list_out = { 0 };
-	struct pool_target_addr		tgt_rank;
+	struct pool_target_addr	       *addrs;
+	d_rank_t			self_rank = dss_self_rank();
 	uint32_t			map_version = 0;
+	int				n = 0;
+	int				i;
 	int				rc;
 
-	tgt_rank.pta_rank = rank;
-	tgt_rank.pta_target = -1;
-	list.pta_number = 1;
-	list.pta_addrs = &tgt_rank;
+	D_ALLOC_ARRAY(addrs, event_set->pss_len);
+	if (addrs == NULL)
+		return -DER_NOMEM;
+	for (i = 0; i < event_set->pss_len; i++) {
+		struct pool_svc_event *event = &event_set->pss_buf[i];
+
+		if (event->psv_type != CRT_EVT_DEAD)
+			continue;
+		if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) {
+			D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n",
+				DP_UUID(svc->ps_uuid));
+			continue;
+		}
+		addrs[n].pta_rank   = event->psv_rank;
+		addrs[n].pta_target = -1;
+		n++;
+	}
+	if (n == 0) {
+		rc = 0;
+		goto out;
+	}
+	list.pta_number = n;
+	list.pta_addrs  = addrs;
 
 	rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */,
 				 NULL, NULL, 0, &list, &inval_list_out, &map_version,
 				 NULL /* hint */, MUS_SWIM);
 
-	D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n",
-		DP_UUID(svc->ps_uuid), map_version, rank, rc);
+	D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n",
+		DP_UUID(svc->ps_uuid), map_version, n, rc);
 
 	pool_target_addr_list_free(&inval_list_out);
-
+out:
+	D_FREE(addrs);
 	return rc;
 }
 
diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index 440ffe68f823..7cda958d2423 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters):
         "common": [
             "D_LOG_FILE_APPEND_PID=1",
             "DAOS_POOL_RF=4",
+            "CRT_EVENT_DELAY=1",
             "COVFILE=/tmp/test.cov"],
         "ofi+tcp": [],
         "ofi+tcp;ofi_rxm": [],

From e0f58831ef4b7de01bc0abb4be85d0635a85f3cd Mon Sep 17 00:00:00 2001
From: Dalton Bohning <dalton.bohning@intel.com>
Date: Fri, 18 Oct 2024 06:48:22 -0700
Subject: [PATCH 06/26] DAOS-16720 cq: pin isort to v1.1.0 (#15338) (#15339)

Pin isort to v1.1.0 to avoid suprprise changes and because v1.1.1 is not
working for us.

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
---
 .github/workflows/linting.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 5fce435168a3..67c57bd31545 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -26,7 +26,7 @@ jobs:
       - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d  # v5.1.0
         with:
           python-version: '3'
-      - uses: isort/isort-action@master
+      - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5  # v1.1.0
         with:
           requirementsFiles: "requirements.txt"
       - name: Run on SConstruct file.

From cb9d278be5b6730f294d65b74db17e4a948a9a23 Mon Sep 17 00:00:00 2001
From: Ken Cain <kenneth.c.cain@intel.com>
Date: Sat, 19 Oct 2024 02:53:37 -0400
Subject: [PATCH 07/26] DAOS-15852 test: more timing samples for
 co_op_dup_timing() (#14497) (#15324)

Rarely, this test will produce timings that exceed the failure
threshold. Local and PR/CI experiments have shown that increasing
the test's NUM_OPS to more than 200 iterations greatly reduces
or may eliminate such intermittent timing failures, by "spreading out"
the magnitude of the time spent in the 3 main loops of the test
(0% loops perform fault injections, 33%, and 50%).


Signed-off-by: Kenneth Cain <kenneth.c.cain@intel.com>
---
 src/tests/suite/daos_container.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c
index 958f9fd1d165..243a829c3176 100644
--- a/src/tests/suite/daos_container.c
+++ b/src/tests/suite/daos_container.c
@@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state)
 	size_t const       in_sizes[]        = {strlen(in_values[0]), strlen(in_values[1])};
 	int                n                 = (int)ARRAY_SIZE(names);
 	const unsigned int NUM_FP            = 3;
-	const uint32_t     NUM_OPS           = 200;
+	const uint32_t     NUM_OPS           = 500;
 	uint32_t           num_failures      = 0;
 	const uint32_t     SVC_OPS_ENABLED   = 1;
 	const uint32_t     SVC_OPS_ENTRY_AGE = 60;

From f8682fb56bc0d7928eb807f45599a5d9e9ada5de Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Sun, 20 Oct 2024 21:55:19 +0800
Subject: [PATCH 08/26] DAOS-16572 rebuild: properly assign
 global_dtx_resync_version in IV - b26 (#15186)

In rebuild_iv_ent_refresh() for refreshing DTX resync version, needs
to assign rt_global_dtx_resync_version firstly before wakeup related
rebuild_scan_leader.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/rebuild/rebuild_iv.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c
index cc585e037e13..f4b9f75f4078 100644
--- a/src/rebuild/rebuild_iv.c
+++ b/src/rebuild/rebuild_iv.c
@@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
 	struct rebuild_tgt_pool_tracker *rpt;
 	struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf;
 	struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf;
+	uint32_t old_ver;
 	int rc = 0;
 
 	rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver,
@@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
 			       dst_iv->riv_stable_epoch);
 		rpt->rt_global_done = dst_iv->riv_global_done;
 		rpt->rt_global_scan_done = dst_iv->riv_global_scan_done;
-		if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver &&
+		old_ver = rpt->rt_global_dtx_resync_version;
+		if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version)
+			rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version;
+		if (old_ver < rpt->rt_rebuild_ver &&
 		    dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) {
 			D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n",
-			       DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version,
+			       DP_UUID(src_iv->riv_pool_uuid), old_ver,
 			       dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver);
 			ABT_mutex_lock(rpt->rt_lock);
 			ABT_cond_signal(rpt->rt_global_dtx_wait_cond);
 			ABT_mutex_unlock(rpt->rt_lock);
 		}
-		rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version;
 	}
 
 out:

From 81e57d0ec818c1f117afa97418d1f126bc528952 Mon Sep 17 00:00:00 2001
From: Jeff Olivier <jeffolivier@google.com>
Date: Mon, 21 Oct 2024 10:10:03 -0600
Subject: [PATCH 09/26] DAOS-16716 ci: Set reference build for PRs (#15337)

Release branch PRs should use the release branch build
instead of master branch build for NLT reference

Signed-off-by: Jeff Olivier <jeffolivier@google.com>
---
 Jenkinsfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 63c2dc28ae13..13ccc5128950 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -801,6 +801,7 @@ pipeline {
                             unitTestPost artifacts: ['nlt_logs/'],
                                          testResults: 'nlt-junit.xml',
                                          always_script: 'ci/unit/test_nlt_post.sh',
+					 referenceJobName: 'daos-stack/daos/release%252F2.6',
                                          valgrind_stash: 'el8-gcc-nlt-memcheck'
                             recordIssues enabledForFailure: true,
                                          failOnError: false,

From c8213799c4c14ad23ab6ef87c386b5e1d16ec089 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Tue, 22 Oct 2024 02:14:44 +0800
Subject: [PATCH 10/26] DAOS-16329 chk: maintenance mode after checking pool
 with dryrun - b26 (#14985)

Sometimes, after system shutdown unexpectedly, the users may expect
to check their critical data under some kind of maintenance mode.
Under such mode, no user data can be modified or moved or aggregated.
That will guarantee no further potential (DAOS logic caused) damage
can happen during the check.

For such purpose, we will enhance current DAOS CR logic with --dryrun
option to allow the pool (after check) to be opened as immutable with
disabling some mechanism that may potentially cause data modification
or movement (such as rebuild or aggregation).

Under such mode, if client wants to connect to the pool, the read-only
option must be specified. Similarly for opening container in such pool.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/chk/chk_common.c            |   2 +-
 src/chk/chk_engine.c            |  13 +--
 src/chk/chk_internal.h          |   1 +
 src/client/dfs/cont.c           |   4 +-
 src/container/srv_container.c   |  32 +++++--
 src/container/srv_target.c      |  10 ++-
 src/include/daos/container.h    |   4 +-
 src/include/daos_cont.h         |   3 +
 src/include/daos_srv/pool.h     |   7 +-
 src/include/daos_srv/security.h |  15 +++-
 src/mgmt/srv_target.c           |   2 +-
 src/pool/srv_internal.h         |   6 ++
 src/pool/srv_pool.c             |  82 +++++++++++++-----
 src/pool/srv_target.c           |  16 ++--
 src/rebuild/srv.c               |   2 +-
 src/security/srv_acl.c          |   8 +-
 src/tests/suite/daos_cr.c       | 144 ++++++++++++++++++++++++++++++++
 17 files changed, 297 insertions(+), 54 deletions(-)

diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c
index ace1c736791d..39821ec63287 100644
--- a/src/chk/chk_common.c
+++ b/src/chk/chk_common.c
@@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr)
 		if (cpr->cpr_started)
 			chk_pool_shutdown(cpr, true);
 
-		rc = ds_pool_start_after_check(cpr->cpr_uuid);
+		rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable);
 		if (rc != 0) {
 			D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n",
 			       DP_UUID(cpr->cpr_uuid), DP_RC(rc));
diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c
index 56e6da3ad9b9..ad61af851ce5 100644
--- a/src/chk/chk_engine.c
+++ b/src/chk/chk_engine.c
@@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args)
 	}
 
 	rc = chk_engine_cont_cleanup(cpr, svc, &aggregator);
-	if (rc != 0)
-		goto out;
-
-	rc = ds_pool_svc_schedule_reconf(svc);
+	if (rc == 0 && !cpr->cpr_immutable)
+		rc = ds_pool_svc_schedule_reconf(svc);
 
 out:
 	chk_engine_cont_list_fini(&aggregator);
@@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins)
 		if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE)
 			continue;
 
+		if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN)
+			cpr->cpr_immutable = 1;
+		else
+			cpr->cpr_immutable = 0;
+
 		if (phase > pool_cbk->cb_phase)
 			phase = pool_cbk->cb_phase;
 
@@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags)
 	cbk = &cpr->cpr_bk;
 	chk_pool_get(cpr);
 
-	rc = ds_pool_start(uuid, false);
+	rc = ds_pool_start(uuid, false, cpr->cpr_immutable);
 	if (rc != 0)
 		D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc));
 
diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h
index 86868e305ee9..9ab16b060b39 100644
--- a/src/chk/chk_internal.h
+++ b/src/chk/chk_internal.h
@@ -596,6 +596,7 @@ struct chk_pool_rec {
 				 cpr_stop:1,
 				 cpr_done:1,
 				 cpr_skip:1,
+				 cpr_immutable:1,
 				 cpr_dangling:1,
 				 cpr_for_orphan:1,
 				 cpr_notified_exit:1,
diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c
index 910e1819aa32..278229140564 100644
--- a/src/client/dfs/cont.c
+++ b/src/client/dfs/cont.c
@@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
 out_snap:
 	D_FREE(oit_args);
 	epr.epr_hi = epr.epr_lo = snap_epoch;
-	rc2                     = daos_cont_destroy_snap(coh, epr, NULL);
+	rc2 = daos_cont_destroy_snap(coh, epr, NULL);
+	if (rc2 != 0)
+		D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2));
 	if (rc == 0)
 		rc = daos_der2errno(rc2);
 out_dfs:
diff --git a/src/container/srv_container.c b/src/container/srv_container.c
index 372da43afe44..91d87d9b9781 100644
--- a/src/container/srv_container.c
+++ b/src/container/srv_container.c
@@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont,
 	 * - Users who can delete any container in the pool
 	 * - Users who have been given access to delete the specific container
 	 */
-	if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) &&
-	    !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred,
-				    &owner, acl)) {
+	if (pool_hdl->sph_pool->sp_immutable ||
+	    (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) &&
+	     !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) {
 		D_ERROR(DF_CONT": permission denied to delete cont\n",
 			DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid));
 		D_GOTO(out_prop, rc = -DER_NO_PERM);
@@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		goto out;
 	}
 
+	if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) {
+		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable "
+			"container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n",
+			DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+			DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc));
+		goto out;
+	}
+
 	/*
 	 * Need props to check for pool redundancy requirements and access
 	 * control.
@@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		D_GOTO(out, rc);
 	}
 
+	D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags "
+		DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n",
+		DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+		DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas);
+
 	if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) {
 		D_ERROR(DF_CONT": permission denied evicting all handles\n",
 			DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid));
@@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		goto out;
 	}
 
-	if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) {
-		D_ERROR(DF_CONT": permission denied opening exclusively\n",
-			DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid));
-		daos_prop_free(prop);
+	if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) ||
+	    ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) {
 		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container "
+			"with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n",
+			DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+			DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas,
+			DP_RC(rc));
+		daos_prop_free(prop);
 		goto out;
 	}
 
diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index b0f3b693580a..a6d3d3c5286a 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -935,7 +935,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid,
 			cont_child->sc_stopping, cont_child->sc_destroying);
 		rc = -DER_SHUTDOWN;
 	} else if (!cont_child_started(cont_child)) {
-		if (!ds_pool_skip_for_check(pool_child->spc_pool)) {
+		if (!ds_pool_restricted(pool_child->spc_pool, false)) {
 			rc = cont_start_agg(cont_child);
 			if (rc != 0)
 				goto out;
@@ -1591,11 +1591,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 		 *     but for creating rebuild global container handle.
 		 */
 		D_ASSERT(hdl->sch_cont != NULL);
+		D_ASSERT(hdl->sch_cont->sc_pool != NULL);
 		hdl->sch_cont->sc_open++;
 
 		if (hdl->sch_cont->sc_open > 1)
 			goto opened;
 
+		if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
+			goto csum_init;
+
 		rc = dtx_cont_open(hdl->sch_cont);
 		if (rc != 0) {
 			D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont "
@@ -1623,10 +1627,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 			D_GOTO(err_dtx, rc);
 		}
 
-		D_ASSERT(hdl->sch_cont != NULL);
-		D_ASSERT(hdl->sch_cont->sc_pool != NULL);
+csum_init:
 		rc = ds_cont_csummer_init(hdl->sch_cont);
-
 		if (rc != 0)
 			D_GOTO(err_dtx, rc);
 	}
diff --git a/src/include/daos/container.h b/src/include/daos/container.h
index 282dfb8ec494..57ef06f63a24 100644
--- a/src/include/daos/container.h
+++ b/src/include/daos/container.h
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags)
 	f = flags;
 
 	/* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. */
-	m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX);
+	m = f & DAOS_COO_IO_BASE_MASK;
 	if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX)
 		return false;
 
diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h
index c3bda9837d8b..632eafe81303 100644
--- a/src/include/daos_cont.h
+++ b/src/include/daos_cont.h
@@ -79,6 +79,9 @@ extern "C" {
 /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */
 #define DAOS_COO_MASK	((1U << DAOS_COO_NBITS) - 1)
 
+/** The basic IO mode: read-only, read-write or exclusively read-write. */
+#define DAOS_COO_IO_BASE_MASK   (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX)
+
 /** Maximum length for container hints */
 #define DAOS_CONT_HINT_MAX_LEN	128
 
diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h
index 6cbe3873f0ad..de22d55ed5d6 100644
--- a/src/include/daos_srv/pool.h
+++ b/src/include/daos_srv/pool.h
@@ -84,6 +84,7 @@ struct ds_pool {
 	uuid_t			sp_srv_pool_hdl;
 	uint32_t		sp_stopping:1,
 				sp_cr_checked:1,
+				sp_immutable:1,
 				sp_fetch_hdls:1,
 				sp_need_discard:1,
 				sp_disable_rebuild:1;
@@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct pool_target_id_list *lis
 int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 			   unsigned int map_version);
 
-bool ds_pool_skip_for_check(struct ds_pool *pool);
-int ds_pool_start_after_check(uuid_t uuid);
-int ds_pool_start(uuid_t uuid, bool aft_chk);
+bool ds_pool_restricted(struct ds_pool *pool, bool immutable);
+int ds_pool_start_after_check(uuid_t uuid, bool immutable);
+int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable);
 int ds_pool_stop(uuid_t uuid);
 int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets,
 			const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains);
diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h
index 203b9b548ee3..28fe83852b33 100644
--- a/src/include/daos_srv/security.h
+++ b/src/include/daos_srv/security.h
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas);
 bool
 ds_sec_cont_can_evict_all(uint64_t cont_capas);
 
+/**
+ * Determine if the container can be modified based on the container security
+ * capabilities.
+ *
+ * \param[in]	cont_capas	Capability bits acquired via
+ *				ds_sec_cont_get_capabilities
+ *
+ * \return	True		Access allowed
+ *		False		Access denied
+ */
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas);
+
 /**
  * Get the security capabilities for a rebuild container handle created by the
  * DAOS server.
diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
index e1d6ee79bce8..7975a2115d4e 100644
--- a/src/mgmt/srv_target.c
+++ b/src/mgmt/srv_target.c
@@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 	tc_out->tc_ranks.ca_arrays = rank;
 	tc_out->tc_ranks.ca_count  = 1;
 
-	rc = ds_pool_start(tc_in->tc_pool_uuid, false);
+	rc = ds_pool_start(tc_in->tc_pool_uuid, false, false);
 	if (rc) {
 		D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n",
 			DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h
index 8f864c8c11a9..612f7760fd14 100644
--- a/src/pool/srv_internal.h
+++ b/src/pool/srv_internal.h
@@ -57,6 +57,12 @@ pool_tls_get()
 	return tls;
 }
 
+static inline bool
+ds_pool_skip_for_check(struct ds_pool *pool)
+{
+	return engine_in_check() && !pool->sp_cr_checked;
+}
+
 struct pool_iv_map {
 	d_rank_t	piv_master_rank;
 	uint32_t	piv_pool_map_ver;
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 2d80088d4b6b..6e3a01379fa7 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -1675,7 +1675,7 @@ init_events(struct pool_svc *svc)
 	D_ASSERT(!events->pse_stop);
 	D_ASSERT(!events->pse_paused);
 
-	if (!ds_pool_skip_for_check(svc->ps_pool)) {
+	if (!ds_pool_restricted(svc->ps_pool, false)) {
 		rc = crt_register_event_cb(ds_pool_crt_event_cb, svc);
 		if (rc != 0) {
 			D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n",
@@ -1706,7 +1706,7 @@ init_events(struct pool_svc *svc)
 	return 0;
 
 err_cb:
-	if (!ds_pool_skip_for_check(svc->ps_pool))
+	if (!ds_pool_restricted(svc->ps_pool, false))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
 	if (events->pse_pending != NULL)
 		free_event_set(&events->pse_pending);
@@ -1721,7 +1721,7 @@ fini_events(struct pool_svc *svc)
 
 	D_ASSERT(events->pse_handler != ABT_THREAD_NULL);
 
-	if (!ds_pool_skip_for_check(svc->ps_pool))
+	if (!ds_pool_restricted(svc->ps_pool, false))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
 
 	ABT_mutex_lock(events->pse_mutex);
@@ -2565,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid)
 	return 0;
 }
 
+struct pool_start_args {
+	bool	psa_aft_chk;
+	bool	psa_immutable;
+};
+
 /*
  * Try to start the pool. Continue the iteration upon errors as other pools may
  * still be able to work.
@@ -2572,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid)
 static int
 start_one(uuid_t uuid, void *varg)
 {
-	int rc;
+	struct pool_start_args	*psa = varg;
+	bool			 aft_chk;
+	bool			 immutable;
+	int			 rc;
+
+	if (psa != NULL) {
+		aft_chk = psa->psa_aft_chk;
+		immutable = psa->psa_immutable;
+	} else {
+		aft_chk = false;
+		immutable = false;
+	}
 
-	D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid));
+	D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n",
+		DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no");
 
-	rc = ds_pool_start(uuid, varg != NULL ? true : false);
+	rc = ds_pool_start(uuid, aft_chk, immutable);
 	if (rc != 0) {
-		DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid));
+		DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s",
+			 DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no");
 		ds_pool_failed_add(uuid, rc);
 	}
 
@@ -2597,12 +2615,27 @@ pool_start_all(void *arg)
 			DP_RC(rc));
 }
 
+bool
+ds_pool_restricted(struct ds_pool *pool, bool immutable)
+{
+	if (ds_pool_skip_for_check(pool))
+		return true;
+
+	if (pool->sp_immutable && !immutable)
+		return true;
+
+	return false;
+}
+
 int
-ds_pool_start_after_check(uuid_t uuid)
+ds_pool_start_after_check(uuid_t uuid, bool immutable)
 {
-	bool	aft_chk = true;
+	struct pool_start_args	psa;
 
-	return start_one(uuid, &aft_chk);
+	psa.psa_aft_chk = true;
+	psa.psa_immutable = immutable;
+
+	return start_one(uuid, &psa);
 }
 
 /* Note that this function is currently called from the main xstream. */
@@ -3820,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 			   &ds_pool_prop_connectable, &value);
 	if (rc != 0)
 		goto out_lock;
-	D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable);
 	if (!connectable) {
 		D_ERROR(DF_UUID": being destroyed, not accepting connections\n",
 			DP_UUID(in->pci_op.pi_uuid));
@@ -3829,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 
 	/*
 	 * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable()
-	 *	 the PS restart for full service. If some client tries to connect the pool during
+	 *	 and PS restart with full service. If some client tries to connect the pool during
 	 *	 such internal, it will get -DER_BUSY temporarily.
 	 */
 	if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) {
-		D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid));
-		D_GOTO(out_lock, rc = -DER_BUSY);
+		rc = -DER_BUSY;
+		D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n",
+			DP_UUID(in->pci_op.pi_uuid), DP_RC(rc));
+		goto out_lock;
+	}
+
+	if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) {
+		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n",
+			DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc));
+		goto out_lock;
 	}
 
 	/* Check existing pool handles. */
@@ -3997,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 		}
 	}
 
+	D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags "
+		DF_X64", sec_capas " DF_X64 "\n",
+		DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl),
+		svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas);
+
 	rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver,
 				  obj_layout_ver);
 	if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) {
@@ -6522,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun
 
 	D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid));
 
-	if (ds_pool_skip_for_check(svc->ps_pool)) {
+	if (ds_pool_restricted(svc->ps_pool, false)) {
 		D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid));
 		return -DER_OP_CANCELED;
 	}
@@ -8761,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool)
 	ds_rsvc_put(rsvc);
 	return rc;
 }
-
-bool
-ds_pool_skip_for_check(struct ds_pool *pool)
-{
-	return engine_in_check() && !pool->sp_cr_checked;
-}
diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c
index cfa837e8b2a8..113fb757fd90 100644
--- a/src/pool/srv_target.c
+++ b/src/pool/srv_target.c
@@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate)
 		goto done;
 	}
 
-	if (!ds_pool_skip_for_check(child->spc_pool)) {
+	if (!ds_pool_restricted(child->spc_pool, false)) {
 		rc = start_gc_ult(child);
 		if (rc != 0)
 			goto out_close;
@@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool)
  * till ds_pool_stop. Only for mgmt and pool modules.
  */
 int
-ds_pool_start(uuid_t uuid, bool aft_chk)
+ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable)
 {
 	struct ds_pool			*pool;
 	struct daos_llink		*llink;
@@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk)
 	else
 		pool->sp_cr_checked = 0;
 
+	if (immutable)
+		pool->sp_immutable = 1;
+	else
+		pool->sp_immutable = 0;
+
 	rc = pool_child_add_all(pool);
 	if (rc != 0)
 		goto failure_pool;
@@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk)
 		}
 
 		pool->sp_fetch_hdls = 1;
+	}
 
+	if (!ds_pool_restricted(pool, false)) {
 		rc = ds_pool_start_ec_eph_query_ult(pool);
 		if (rc != 0) {
 			D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n",
@@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 		       DP_UUID(pool->sp_uuid), map_version_before, map_version);
 	}
 
-	if (map_updated) {
+	if (map_updated && !ds_pool_restricted(pool, false)) {
 		struct dtx_scan_args	*arg;
 		int ret;
 
-		if (ds_pool_skip_for_check(pool))
-			D_GOTO(out, rc = 0);
-
 		D_ALLOC_PTR(arg);
 		if (arg == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c
index b1f722b8254f..5c463af9e903 100644
--- a/src/rebuild/srv.c
+++ b/src/rebuild/srv.c
@@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver,
 		return 0;
 	}
 
-	if (ds_pool_skip_for_check(pool)) {
+	if (ds_pool_restricted(pool, false)) {
 		D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n",
 			DP_UUID(pool->sp_uuid));
 		return 0;
diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c
index ad3c0107e3b7..8f05216e9715 100644
--- a/src/security/srv_acl.c
+++ b/src/security/srv_acl.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas)
 	return (cont_capas & CONT_CAPA_EVICT_ALL) != 0;
 }
 
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas)
+{
+	return (cont_capas & CONT_CAPAS_W_MASK) != 0;
+}
+
 uint64_t
 ds_sec_get_rebuild_cont_capabilities(void)
 {
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 3827a0eda6a7..4c981f296452 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -18,6 +18,8 @@
 
 #include <daos_mgmt.h>
 
+#include "daos_iotest.h"
+
 /*
  * Will enable accurate query result verification after DAOS-13520 resolved.
  * #define CR_ACCURATE_QUERY_RESULT	1
@@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state)
 	cr_cleanup(arg, &pool, 1);
 }
 
+#define CR_IO_SIZE	16
+
+/*
+ * 1. Create pool and container without inconsistency.
+ * 2. Write something to the container.
+ * 3. Start checker with --dry-run option.
+ * 4. Query checker, it should be completed without any inconsistency reported.
+ * 5. Verify the pool only can be connected with read-only permission.
+ * 6. Verify the container only can be opened with read-only permission.
+ * 7. Verify the object only can be opened with read-only permission.
+ * 8. Verify the object cannot be written.
+ * 9. Read former wrote data and verify its correctness.
+ * 10. Switch to normal mode and cleanup.
+ */
+static void
+cr_maintenance_mode(void **state)
+{
+	test_arg_t		*arg = *state;
+	struct test_pool	 pool = { 0 };
+	struct test_cont	 cont = { 0 };
+	daos_handle_t		 coh;
+	daos_obj_id_t		 oid;
+	struct ioreq		 req;
+	const char		*dkey = "cr_dkey";
+	const char		*akey = "cr_akey";
+	char			 update_buf[CR_IO_SIZE];
+	char			 fetch_buf[CR_IO_SIZE];
+	struct daos_check_info	 dci = { 0 };
+	int			 rc;
+
+	print_message("CR28: maintenance mode after dry-run check\n");
+
+	rc = cr_pool_create(state, &pool, true, TCC_NONE);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_cont_create(state, &pool, &cont, 0);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank);
+	arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS;
+	arg->async = 0;
+	ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg);
+	dts_buf_render(update_buf, CR_IO_SIZE);
+
+	print_message("Generate some data.\n");
+
+	insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	rc = daos_obj_close(req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_stop(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(true);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	cr_ins_wait(1, &pool.pool_uuid, &dci);
+
+	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the pool only can be connected as RDONLY mode.\n");
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the container only can be opened as RDONLY mode.\n");
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+#if 0
+	/*
+	 * NOTE: Currently, DAOS security check is server-side logic, but object open may not
+	 *	 talk with server. So related permission check will not be done until read or
+	 *	 write really happened. If not consider exclusively open and cache case, then
+	 *	 it seems fine; otherwise, need some enhancement in future.
+	 */
+
+	print_message("Verify the object only can be opened as RDONLY mode.\n");
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+#endif
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the object cannot be modified.\n");
+
+	req.arg->expect_result = -DER_NO_PERM;
+	insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	print_message("Verify former wrote data.\n");
+
+	req.arg->expect_result = 0;
+	lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE);
+	assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_start();
+	assert_rc_equal(rc, 0);
+
+	cr_dci_fini(&dci);
+	cr_cleanup(arg, &pool, 1);
+}
+
 static const struct CMUnitTest cr_tests[] = {
 	{ "CR1: start checker for specified pools",
 	  cr_start_specified, async_disable, test_case_teardown},
@@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = {
 	  cr_handle_fail_pool1, async_disable, test_case_teardown},
 	{ "CR27: handle the pool if some engine failed to report some pool service",
 	  cr_handle_fail_pool2, async_disable, test_case_teardown},
+	{ "CR28: maintenance mode after dry-run check",
+	  cr_maintenance_mode, async_disable, test_case_teardown},
 };
 
 static int

From b913d3e9685fe8b31b046d197f66477460053e68 Mon Sep 17 00:00:00 2001
From: Phil Henderson <phillip.henderson@intel.com>
Date: Mon, 21 Oct 2024 14:19:00 -0400
Subject: [PATCH 11/26] DAOS-16265 test: Fix erasurecode/rebuild_fio.py out of
 space (#15020) (#15340)

Prevent accumulating large server log files caused by temporarily
enabling the DEBUG log mask while creating or destroying pools.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
---
 .../ftest/erasurecode/multiple_failure.yaml   |  1 +
 src/tests/ftest/erasurecode/rebuild_fio.yaml  |  1 +
 src/tests/ftest/util/apricot/apricot/test.py  | 22 +++++++++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml
index 78f132474b5b..95aab5413291 100644
--- a/src/tests/ftest/erasurecode/multiple_failure.yaml
+++ b/src/tests/ftest/erasurecode/multiple_failure.yaml
@@ -25,6 +25,7 @@ server_config:
       storage: auto
 pool:
   size: 93%
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml
index a895c3567072..a3539d865792 100644
--- a/src/tests/ftest/erasurecode/rebuild_fio.yaml
+++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml
@@ -39,6 +39,7 @@ pool:
   aggregation:
     threshold: 50000000
     aggr_timeout: 180
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index 42e05937f37b..563ff7adecee 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs):
         self.setup_start_agents = True
         self.slurm_exclude_servers = False
         self.slurm_exclude_nodes = NodeSet()
+        self.max_test_dir_usage_check = 90
         self.host_info = HostInfo()
         self.hostlist_servers = NodeSet()
         self.hostlist_clients = NodeSet()
@@ -693,6 +694,11 @@ def setUp(self):
         self.slurm_exclude_servers = self.params.get(
             "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers)
 
+        # Max test directory usage percentage - when exceeded will display sizes of files in the
+        # test directory
+        self.max_test_dir_usage_check = self.params.get(
+            "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check)
+
         # The server config name should be obtained from each ServerManager
         # object, but some tests still use this TestWithServers attribute.
         self.server_group = self.params.get("name", "/run/server_config/*", "daos_server")
@@ -765,12 +771,20 @@ def setUp(self):
 
         # List common test directory contents before running the test
         self.log.info("-" * 100)
-        self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir))
+        self.log.debug(
+            "Common test directory (%s) contents (check > %s%%):",
+            os.path.dirname(self.test_dir), self.max_test_dir_usage_check)
         all_hosts = include_local_host(self.host_info.all_hosts)
         test_dir_parent = os.path.dirname(self.test_dir)
-        result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
-        if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90:
-            run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*")
+        _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
+        _details = NodeSet()
+        for _host, _stdout in _result.all_stdout.items():
+            _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout)
+            _test_dir_usage_int = int(max(_test_dir_usage + ["0"]))
+            if _test_dir_usage_int > self.max_test_dir_usage_check:
+                _details.add(_host)
+        if _details:
+            run_remote(self.log, _details, f"du -sh {test_dir_parent}/*")
         self.log.info("-" * 100)
 
         if not self.start_servers_once or self.name.uid == 1:

From ffa1c9d952fb62f72524abec119e7d829bce5633 Mon Sep 17 00:00:00 2001
From: Michael MacDonald <mjmac@google.com>
Date: Tue, 22 Oct 2024 11:38:08 -0400
Subject: [PATCH 12/26] DAOS-16693 telemetry: Avoid race between init/read
 (#15306) (#15322)

In rare cases, a reader may attempt to access a telemetry
node after it has been added to the tree, but before it
has been fully initialized. Use an atomic to prevent
reads before the initialization has completed. Unlucky
readers will get a -DER_AGAIN instead of crashing.

Signed-off-by: Michael MacDonald <mjmac@google.com>
---
 src/control/lib/telemetry/counter.go   | 20 ++++++++-----
 src/control/lib/telemetry/duration.go  | 20 ++++++++-----
 src/control/lib/telemetry/gauge.go     | 38 ++++++++++++++----------
 src/control/lib/telemetry/snapshot.go  | 20 ++++++++-----
 src/control/lib/telemetry/telemetry.go | 12 ++++++++
 src/control/lib/telemetry/timestamp.go | 22 +++++++++-----
 src/gurt/telemetry.c                   | 41 ++++++++++++++++++++++++++
 src/include/gurt/telemetry_common.h    |  3 ++
 8 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go
index 81549a32daf5..e6e59a8d0ea0 100644
--- a/src/control/lib/telemetry/counter.go
+++ b/src/control/lib/telemetry/counter.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 {
 }
 
 func (c *Counter) Value() uint64 {
+	ctrVal := BadUintVal
 	if c.handle == nil || c.node == nil {
-		return BadUintVal
+		return ctrVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_counter(c.handle.ctx, &val, c.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_counter(c.handle.ctx, &val, c.node)
+		if res == C.DER_SUCCESS {
+			ctrVal = uint64(val)
+		}
+		return res
 	}
+	c.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return ctrVal
 }
 
 func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter {
diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go
index 1f32125bc90a..3cfd240bde76 100644
--- a/src/control/lib/telemetry/duration.go
+++ b/src/control/lib/telemetry/duration.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType {
 }
 
 func (d *Duration) Value() time.Duration {
+	durValue := BadDuration
 	if d.handle == nil || d.node == nil {
-		return BadDuration
+		return durValue
 	}
 
-	var tms C.struct_timespec
-
-	res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node)
-	if res == C.DER_SUCCESS {
-		return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond
+	fetch := func() C.int {
+		var tms C.struct_timespec
+		res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node)
+		if res == C.DER_SUCCESS {
+			durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond
+		}
+		return res
 	}
+	d.fetchValWithRetry(fetch)
 
-	return BadDuration
+	return durValue
 }
 
 func (d *Duration) FloatValue() float64 {
diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go
index ea84ff905047..93db24ab9fc9 100644
--- a/src/control/lib/telemetry/gauge.go
+++ b/src/control/lib/telemetry/gauge.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 {
 
 // Value returns the value as an unsigned integer.
 func (g *Gauge) Value() uint64 {
+	gaugeVal := BadUintVal
 	if g.handle == nil || g.node == nil {
-		return BadUintVal
+		return gaugeVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node)
+		if res == C.DER_SUCCESS {
+			gaugeVal = uint64(val)
+		}
+		return res
 	}
+	g.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return gaugeVal
 }
 
 func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge {
@@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 {
 
 // Value returns the gauge value as an unsigned integer.
 func (g *StatsGauge) Value() uint64 {
+	gaugeVal := BadUintVal
 	if g.handle == nil || g.node == nil {
-		return BadUintVal
+		return gaugeVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node)
+		if res == C.DER_SUCCESS {
+			gaugeVal = uint64(val)
+		}
+		return res
 	}
+	g.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return gaugeVal
 }
 
 func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge {
diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go
index 2ffa23296c33..5b2af9f07479 100644
--- a/src/control/lib/telemetry/snapshot.go
+++ b/src/control/lib/telemetry/snapshot.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType {
 }
 
 func (s *Snapshot) Value() time.Time {
+	timeVal := time.Time{} // zero val
 	if s.handle == nil || s.node == nil {
-		return time.Time{}
+		return timeVal
 	}
 
-	var tms C.struct_timespec
-
-	res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node)
-	if res == C.DER_SUCCESS {
-		return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec))
+	fetch := func() C.int {
+		var tms C.struct_timespec
+		res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node)
+		if res == C.DER_SUCCESS {
+			timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec))
+		}
+		return res
 	}
+	s.fetchValWithRetry(fetch)
 
-	return time.Time{}
+	return timeVal
 }
 
 func (s *Snapshot) FloatValue() float64 {
diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go
index bb0593240b66..479c41e2aab2 100644
--- a/src/control/lib/telemetry/telemetry.go
+++ b/src/control/lib/telemetry/telemetry.go
@@ -84,6 +84,8 @@ const (
 	BadDuration = time.Duration(BadIntVal)
 
 	PathSep = filepath.Separator
+
+	maxFetchRetries = 1
 )
 
 type (
@@ -304,6 +306,16 @@ func (mb *metricBase) String() string {
 	return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})]))
 }
 
+func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int {
+	var rc C.int
+	for i := 0; i < maxFetchRetries; i++ {
+		if rc = fetchFn(); rc == C.DER_SUCCESS {
+			return rc
+		}
+	}
+	return rc
+}
+
 func (sm *statsMetric) Min() uint64 {
 	return uint64(sm.stats.dtm_min)
 }
diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go
index 97ef5bb1ed95..c787aed488df 100644
--- a/src/control/lib/telemetry/timestamp.go
+++ b/src/control/lib/telemetry/timestamp.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType {
 }
 
 func (t *Timestamp) Value() time.Time {
-	zero := time.Time{}
+	timeVal := time.Time{} // zero val
 	if t.handle == nil || t.node == nil {
-		return zero
+		return timeVal
 	}
-	var clk C.time_t
-	res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node)
-	if res == C.DER_SUCCESS {
-		return time.Unix(int64(clk), 0)
+
+	fetch := func() C.int {
+		var clk C.time_t
+		res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node)
+		if res == C.DER_SUCCESS {
+			timeVal = time.Unix(int64(clk), 0)
+		}
+		return res
 	}
-	return zero
+	t.fetchValWithRetry(fetch)
+
+	return timeVal
 }
 
 // FloatValue converts the timestamp to time in seconds since the UNIX epoch.
diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c
index 8f08192d0720..be7762184a93 100644
--- a/src/gurt/telemetry.c
+++ b/src/gurt/telemetry.c
@@ -13,6 +13,7 @@
 #include <float.h>
 #include <pthread.h>
 #include <malloc.h>
+#include <gurt/atomic.h>
 #include <gurt/common.h>
 #include <gurt/list.h>
 #include <sys/shm.h>
@@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode,
 		goto out;
 	tmp->dtn_metric  = NULL;
 	tmp->dtn_sibling = NULL;
+	atomic_store_relaxed(&tmp->dtn_readable, false);
 
 	*newnode = node;
 out:
@@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type,
 		pthread_mutexattr_destroy(&mattr);
 		temp->dtn_protect = true;
 	}
+
+	atomic_store_relaxed(&temp->dtn_readable, true);
+
 	if (node != NULL)
 		*node = temp;
 
@@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...)
 	return rc;
 }
 
+static bool
+node_is_readable(struct d_tm_node_t *node)
+{
+	if (node == NULL)
+		return false;
+
+	return atomic_load_relaxed(&node->dtn_readable);
+}
+
 /**
  * Creates histogram counters for the given node.  It calculates the
  * extents of each bucket and creates counters at the path specified that
@@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx,
 	if (ctx == NULL || histogram == NULL || node == NULL)
 		return -DER_INVAL;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	rc = validate_node_ptr(ctx, node, &shmem);
 	if (rc != 0)
 		return rc;
@@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket,
 	if (rc != 0)
 		return rc;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	if (!has_stats(node))
 		return -DER_OP_NOT_PERMITTED;
 
@@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val,
 	if (node->dtn_type != D_TM_COUNTER)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	/* "ctx == NULL" is server side fast version to read the counter. */
 	if (ctx == NULL) {
 		metric_data = node->dtn_metric;
@@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val,
 	if (node->dtn_type != D_TM_TIMESTAMP)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo,
 	if (node->dtn_type != D_TM_MEMINFO)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms,
 	if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms,
 	if (!(node->dtn_type & D_TM_DURATION))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data == NULL)
 		return -DER_METRIC_NOT_FOUND;
@@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val,
 	if (!is_gauge(node))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		dtm_stats = conv_ptr(shmem, metric_data->dtm_stats);
@@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units,
 	if (node->dtn_type == D_TM_DIRECTORY)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h
index 4e7d3b3a02da..2068c1c0cce9 100644
--- a/src/include/gurt/telemetry_common.h
+++ b/src/include/gurt/telemetry_common.h
@@ -8,6 +8,8 @@
 
 #include <gurt/common.h>
 
+#include <stdatomic.h>
+
 #define D_TM_VERSION			1
 #define D_TM_MAX_NAME_LEN		256
 #define D_TM_MAX_DESC_LEN		128
@@ -236,6 +238,7 @@ struct d_tm_node_t {
 	pthread_mutex_t		dtn_lock; /** individual mutex */
 	struct d_tm_metric_t	*dtn_metric; /** values */
 	bool			dtn_protect; /** synchronized access */
+	_Atomic bool             dtn_readable; /** fully initialized and ready for reads */
 };
 
 struct d_tm_nodeList_t {

From 42a0d35d815505ba0435e192933160b52e55c7c2 Mon Sep 17 00:00:00 2001
From: Alexander Oganezov <alexander.a.oganezov@intel.com>
Date: Tue, 22 Oct 2024 09:10:51 -0700
Subject: [PATCH 13/26] DAOS-16696 cart: Fix rc in error path (#15313) (#15357)

- Fix rc in error path during ivo_on_update failure

Required-githooks: true

Signed-off-by: Alexander A Oganezov <alexander.a.oganezov@intel.com>
---
 src/cart/crt_iv.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c
index 603e565ac1f1..bf8124a8a6a3 100644
--- a/src/cart/crt_iv.c
+++ b/src/cart/crt_iv.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info)
 	return rc;
 
 send_error:
-	rc = crt_bulk_free(cb_info->buc_bulk_hdl);
+	/* send back whatever error got us here */
 	output->rc = rc;
+	rc         = crt_bulk_free(cb_info->buc_bulk_hdl);
+	if (rc != 0)
+		DL_ERROR(rc, "crt_bulk_free() failed");
+
 	iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value,
 			   cb_info->buc_user_priv);
 

From 1ae3f29dcbebe9a4ae123302cc11b9dfac870c8f Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Wed, 23 Oct 2024 00:57:32 +0800
Subject: [PATCH 14/26] DAOS-16574 vos: shrink DTX table blob size - b26
 (#15220) (#15221)

Use 4KB blob for committed DTX table and 16KB for active DTX table.
It is more efficient for lower allocator and reduce the possibility
of space allocation failure when space pressure.

Simplify vos_dtx_commit logic and code cleanup.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/vos/vos_dtx.c | 101 +++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 54 deletions(-)

diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index 1c60f7815070..f1d69b54f4e4 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -15,11 +15,10 @@
 #include "vos_layout.h"
 #include "vos_internal.h"
 
-/* 128 KB per SCM blob */
-#define DTX_BLOB_SIZE		(1 << 17)
-/** Ensure 16-bit signed int is sufficient to store record index */
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) <  (1 << 15));
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) <  (1 << 15));
+/* 16 KB blob for each active DTX blob */
+#define DTX_ACT_BLOB_SIZE	(1 << 14)
+/* 4 KB for committed DTX blob */
+#define DTX_CMT_BLOB_SIZE	(1 << 12)
 
 #define DTX_ACT_BLOB_MAGIC	0x14130a2b
 #define DTX_CMT_BLOB_MAGIC	0x2502191c
@@ -767,7 +766,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 static int
 vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch,
 		   daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p,
-		   struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal)
+		   struct vos_dtx_act_ent **dae_p, bool *rm_cos)
 {
 	struct vos_dtx_act_ent		*dae = NULL;
 	struct vos_dtx_cmt_ent		*dce = NULL;
@@ -866,10 +865,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t
 		goto out;
 
 	rc = dtx_rec_release(cont, dae, false);
-	if (rc != 0) {
-		*fatal = true;
+	if (rc != 0)
 		goto out;
-	}
 
 	D_ASSERT(dae_p != NULL);
 	*dae_p = dae;
@@ -915,7 +912,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 	umem_off_t			 dbd_off;
 	int				 rc;
 
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space when create active DTX table.\n");
 		return -DER_NOSPACE;
@@ -923,7 +920,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_ACT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 			sizeof(struct vos_dtx_act_ent_df);
 	dbd->dbd_count = 0;
 	dbd->dbd_index = 0;
@@ -1994,12 +1991,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 	umem_off_t			 dbd_off;
 	uint64_t			 cmt_time = daos_gettime_coarse();
 	int				 committed = 0;
-	int				 cur = 0;
 	int				 rc = 0;
-	int				 rc1 = 0;
+	int				 p = 0;
 	int				 i = 0;
 	int				 j;
-	bool				 fatal = false;
+	int				 k;
 	bool				 allocated = false;
 
 	dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail);
@@ -2017,67 +2013,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 		goto new_blob;
 
 again:
-	for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0;
-	     i++, cur++) {
-		struct vos_dtx_cmt_ent	*dce = NULL;
-
-		rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce,
-					daes != NULL ? &daes[cur] : NULL,
-					rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal);
-		if (dces != NULL)
-			dces[cur] = dce;
-
-		if (fatal)
-			goto out;
-
-		if (rc == 0 && (daes == NULL || daes[cur] != NULL))
+	for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) {
+		rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i],
+					daes != NULL ? &daes[i] : NULL,
+					rm_cos != NULL ? &rm_cos[i] : NULL);
+		if (rc == 0 && (daes == NULL || daes[i] != NULL))
 			committed++;
 
 		if (rc == -DER_ALREADY || rc == -DER_NONEXIST)
 			rc = 0;
 
-		if (rc1 == 0)
-			rc1 = rc;
+		if (rc != 0)
+			goto out;
+
+		if (dces[i] != NULL)
+			j++;
+	}
 
-		if (dce != NULL) {
-			rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j],
-					      sizeof(struct vos_dtx_cmt_ent_df),
-					      UMEM_XADD_NO_SNAPSHOT);
+	if (j > dbd->dbd_count) {
+		if (!allocated) {
+			rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count],
+					      sizeof(struct vos_dtx_cmt_ent_df) *
+					      (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT);
 			if (rc != 0)
-				D_GOTO(out, fatal = true);
+				goto out;
 
-			memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base,
+			/* Only need to add range for the first partial blob. */
+			rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count));
+			if (rc != 0)
+				goto out;
+		}
+
+		for (k = dbd->dbd_count; k < j; k++, p++) {
+			while (dces[p] == NULL)
+				p++;
+
+			memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base,
 			       sizeof(struct vos_dtx_cmt_ent_df));
 		}
-	}
 
-	if (!allocated) {
-		/* Only need to add range for the first partial blob. */
-		rc = umem_tx_add_ptr(umm, &dbd->dbd_count,
-				     sizeof(dbd->dbd_count));
-		if (rc != 0)
-			D_GOTO(out, fatal = true);
+		dbd->dbd_count = j;
 	}
 
-	dbd->dbd_count = j;
-
-	if (i == count || rc1 != 0)
+	if (i == count)
 		goto out;
 
 new_blob:
 	dbd_prev = dbd;
 	/* Need new @dbd */
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space to store committed DTX %d "DF_DTI"\n",
-			count, DP_DTI(&dtis[cur]));
-		fatal = true;
+			count, DP_DTI(&dtis[i]));
 		D_GOTO(out, rc = -DER_NOSPACE);
 	}
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_CMT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 		       sizeof(struct vos_dtx_cmt_ent_df);
 	dbd->dbd_prev = umem_ptr2off(umm, dbd_prev);
 
@@ -2090,21 +2083,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 				     sizeof(cont_df->cd_dtx_committed_head) +
 				     sizeof(cont_df->cd_dtx_committed_tail));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 
 		cont_df->cd_dtx_committed_head = dbd_off;
 	} else {
 		rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next,
 				     sizeof(dbd_prev->dbd_next));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 
 		dbd_prev->dbd_next = dbd_off;
 
 		rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail,
 				     sizeof(cont_df->cd_dtx_committed_tail));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 	}
 
 	D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n",
@@ -2115,7 +2108,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 	goto again;
 
 out:
-	return fatal ? rc : (committed > 0 ? committed : rc1);
+	return rc < 0 ? rc : committed;
 }
 
 void

From dcf84195f80f2d0367fd03264b76d7a9674fdd19 Mon Sep 17 00:00:00 2001
From: Li Wei <wei.g.li@intel.com>
Date: Wed, 23 Oct 2024 23:20:15 +0900
Subject: [PATCH 15/26] DAOS-16653 doc: Fix CRT_EVENT_DELAY description
 (#15351) (#15371)

Fix the description of the CRT_EVENT_DELAY environment variable in
docs/admin/env_variables.md.

Signed-off-by: Li Wei <wei.g.li@intel.com>
---
 docs/admin/env_variables.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
index 31e0f3e87672..3de0a0792030 100644
--- a/docs/admin/env_variables.md
+++ b/docs/admin/env_variables.md
@@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_MD\_CAP         |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
 |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
 |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
-|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
+|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
 |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
 |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
 |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|

From 4c49f3626726e561dfab41c21a9345573828abc3 Mon Sep 17 00:00:00 2001
From: Ken Cain <kenneth.c.cain@intel.com>
Date: Wed, 23 Oct 2024 16:54:02 -0400
Subject: [PATCH 16/26] DAOS-16650 control: dmg system exclude, update group
 version (#15288) (#15349)

With this change, when a daos administrator runs dmg system exclude
for a given set of engines, the system map version / cart primary group
version will be updated. In turn, daos_engines will more immediately
detect the "loss" of the administratively excluded engines, update
pool maps and perform rebuild. This change supports a use case of
a proactive exclusion of ranks that are expected to be impacted by
planned maintenance that would cut off connectivity to certain
engines.

Signed-off-by: Kenneth Cain <kenneth.c.cain@intel.com>
---
 src/control/server/mgmt_system.go      |  2 ++
 src/control/server/mgmt_system_test.go | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go
index e85091b5529f..a35753d95ba1 100644
--- a/src/control/server/mgmt_system.go
+++ b/src/control/server/mgmt_system.go
@@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude
 		})
 	}
 
+	svc.reqGroupUpdate(ctx, false)
+
 	return resp, nil
 }
 
diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go
index f482b1943a19..0cffe7f63d86 100644
--- a/src/control/server/mgmt_system_test.go
+++ b/src/control/server/mgmt_system_test.go
@@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) {
 				mockMember(t, 3, 2, "joined"),
 			},
 		},
+
 		"unexclude hosts": {
 			req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true},
 			members: system.Members{
@@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) {
 			if tc.req != nil && tc.req.Sys == "" {
 				tc.req.Sys = build.DefaultSystemName
 			}
+
+			startMapVer, err := svc.sysdb.CurMapVersion()
+			if err != nil {
+				t.Fatalf("startMapVer CurMapVersion() failed\n")
+				return
+			}
 			gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req)
 			test.CmpErr(t, tc.expAPIErr, gotAPIErr)
 			if tc.expAPIErr != nil {
 				return
 			}
 
+			// Check for any system map version increase by the (asynchronous) update.
+			// Test will time out if it never happens, thus choice of an infinite loop here.
+			for {
+				curMapVer, err := svc.sysdb.CurMapVersion()
+				if err != nil {
+					t.Fatalf("CurMapVersion() failed\n")
+					return
+				}
+
+				if curMapVer > startMapVer {
+					break
+				}
+			}
+
 			checkRankResults(t, tc.expResults, gotResp.Results)
 			checkMembers(t, tc.expMembers, svc.membership)
 		})

From 2819d4532e00b2395396ba3294e0bdc2c6a0449c Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Thu, 24 Oct 2024 15:04:10 +0800
Subject: [PATCH 17/26] DAOS-16488 chk: take sd_lock before accessing VOS
 sys_db - b26 (#15269)

The VOS sys_db may have multuiple users, such as SMD and CHK.
It is caller's duty to take lock against the VOS sys_db before
accessing it to handle concurrent operations from multiple XS.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/chk/chk_vos.c | 67 ++++++++++++++++++++++++++---------------------
 src/common/lru.c  |  4 ++-
 src/vos/sys_db.c  | 27 +++++++++++++++----
 3 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c
index fdefc2995f71..af2904affd13 100644
--- a/src/chk/chk_vos.c
+++ b/src/chk/chk_vos.c
@@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size)
 {
 	d_iov_t	key_iov;
 	d_iov_t	val_iov;
+	int	rc;
 
 	d_iov_set(&key_iov, key, key_size);
 	d_iov_set(&val_iov, val, val_size);
 
-	return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
+	rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
+
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
+	return rc;
 }
 
 static int
@@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size)
 	d_iov_t	val_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
-
 	d_iov_set(&key_iov, key, key_size);
 	d_iov_set(&val_iov, val, val_size);
 
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
 	rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
 
-out:
 	return rc;
 }
 
@@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size)
 	d_iov_t	key_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
-
 	d_iov_set(&key_iov, key, key_size);
 
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
 	rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov);
 
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
 
-out:
 	return rc;
 }
 
 static int
 chk_db_traverse(sys_db_trav_cb_t cb, void *args)
 {
-	return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args);
+	int	rc;
+
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
+	rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args);
+
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
+	return rc;
 }
 
 int
@@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 	d_iov_t	val_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
 
 	if (cpp->cp_rank_nr != 0 && rank_list != NULL) {
 		D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n",
@@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 
 		rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 		if (rc != 0)
-			goto end;
+			goto out;
 	}
 
 	d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY));
@@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 
 	rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 
-end:
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
-
 out:
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
 	if (rc != 0)
 		D_ERROR("Failed to update check property on rank %u: "DF_RC"\n",
 			dss_self_rank(), DP_RC(rc));
diff --git a/src/common/lru.c b/src/common/lru.c
index de86d367e0ea..87dbdaddaa9a 100644
--- a/src/common/lru.c
+++ b/src/common/lru.c
@@ -255,7 +255,9 @@ void
 daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
 {
 	D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1);
-	D_ASSERT(d_list_empty(&llink->ll_qlink));
+	D_ASSERTF(d_list_empty(&llink->ll_qlink),
+		  "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n",
+		  lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next);
 
 	lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link);
 
diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c
index e7b4a2baa208..d1f6d4bce98a 100644
--- a/src/vos/sys_db.c
+++ b/src/vos/sys_db.c
@@ -349,8 +349,9 @@ vos_db_init(const char *db_path)
 int
 vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini)
 {
-	int	create;
-	int	rc;
+	ABT_mutex_attr	attr = ABT_MUTEX_ATTR_NULL;
+	int		create;
+	int		rc;
 
 	D_ASSERT(db_path != NULL);
 
@@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 		goto failed;
 	}
 
-	rc = ABT_mutex_create(&vos_db.db_lock);
+	rc = ABT_mutex_attr_create(&attr);
 	if (rc != ABT_SUCCESS) {
-		rc = -DER_NOMEM;
-		goto failed;
+		D_ERROR("Failed to create mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to set mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
 	}
 
+	rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to create mutex with attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	ABT_mutex_attr_free(&attr);
+
 	vos_db.db_poh = DAOS_HDL_INVAL;
 	vos_db.db_coh = DAOS_HDL_INVAL;
 
@@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 	}
 	return 0;
 failed:
+	if (attr != ABT_MUTEX_ATTR_NULL)
+		ABT_mutex_attr_free(&attr);
 	vos_db_fini();
 	return rc;
 }

From ec3aa1c64d02d959289ec9417df96a8e945b5a0b Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Thu, 24 Oct 2024 20:34:28 +0800
Subject: [PATCH 18/26] DAOS-16469 dtx: optimize DTX CoS cache - b26 (#15085)

If there are a lot of committable DTX entries in DTX CoS cache,
then it may be inefficient to locate the DTX entry in CoS cache
with given oid + dkey_hash, that may happen under the case of
that DTX batched commit is blocked (such as because of network
trouble) as to trigger DTX refresh (for DTX cleanup) on other
related engines. If that happened, it will increase the system
load on such engine and slow down DTX commit further more. The
patch reduces unnecessary search operation inside CoS cache.

Other changes:

1. Metrics (io/dtx/async_cmt_lat/tgt_id) for DTX asynchronously
   commit latency (with unit ms).

2. Fix a bug in sched_ult2xs() with multiple numa sockets for
   DSS_XS_OFFLOAD case.

3. Delay commit (or abort) collective DTX on the leader target
   to handle resent race.

4. Avoid blocking dtx_req_wait() if chore failed to send out
   some DTX RPC.

5. Some cleanup for error handling.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/container/srv_target.c              |   1 +
 src/dtx/dtx_common.c                    |  43 ++--
 src/dtx/dtx_cos.c                       | 308 ++++++++++++++++--------
 src/dtx/dtx_internal.h                  |   4 +
 src/dtx/dtx_resync.c                    |   4 +-
 src/dtx/dtx_rpc.c                       | 104 +++++---
 src/dtx/dtx_srv.c                       |  13 +-
 src/engine/ult.c                        |   2 +-
 src/include/daos_srv/container.h        |   2 +
 src/include/daos_srv/dtx_srv.h          |   9 +-
 src/tests/ftest/util/telemetry_utils.py |   6 +-
 11 files changed, 322 insertions(+), 174 deletions(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index a6d3d3c5286a..35ac3d4285c5 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid,
 	cont->sc_dtx_committable_coll_count = 0;
 	D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list);
 	D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list);
+	D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list);
 
 	*link = &cont->sc_list;
 	return 0;
diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c
index ff4f2dfe4ef0..ecb156729ed6 100644
--- a/src/dtx/dtx_common.c
+++ b/src/dtx/dtx_common.c
@@ -392,11 +392,11 @@ dtx_cleanup(void *arg)
 				if (rc == 0) {
 					D_ASSERT(dce != NULL);
 
-					rc = dtx_coll_commit(cont, dce, NULL);
+					rc = dtx_coll_commit(cont, dce, NULL, false);
 					dtx_coll_entry_put(dce);
 				}
 			} else {
-				rc = dtx_commit(cont, &dte, NULL, 1);
+				rc = dtx_commit(cont, &dte, NULL, 1, false);
 			}
 		}
 
@@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg)
 	tls->dt_batched_ult_cnt++;
 
 	/* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */
-	while (!dss_ult_exiting(dbca->dbca_commit_req) &&
+	while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) &&
 		dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) {
 		struct dtx_entry	**dtes = NULL;
-		struct dtx_cos_key	 *dcks = NULL;
 		struct dtx_coll_entry	 *dce = NULL;
 		struct dtx_stat		  stat = { 0 };
 		int			  cnt;
 		int			  rc;
 
 		cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL,
-					    DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce);
+					    DAOS_EPOCH_MAX, false, &dtes, NULL, &dce);
 		if (cnt == 0)
 			break;
 
@@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg)
 			/* Currently, commit collective DTX one by one. */
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, NULL, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, NULL, cnt, true);
 		}
-		dtx_free_committable(dtes, dcks, dce, cnt);
+		dtx_free_committable(dtes, NULL, dce, cnt);
 		if (rc != 0) {
 			D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n",
 			       cnt, DP_UUID(cont->sc_uuid), DP_RC(rc));
@@ -1271,7 +1270,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	uint32_t			 flags;
 	int				 status = -1;
 	int				 rc = 0;
-	int				 i;
 	bool				 aborted = false;
 	bool				 unpin = false;
 
@@ -1424,10 +1422,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 		vos_dtx_mark_committable(dth);
 
 		if (dlh->dlh_coll) {
-			rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL);
+			rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false);
 		} else {
 			dte = &dth->dth_dte;
-			rc = dtx_commit(cont, &dte, NULL, 1);
+			rc = dtx_commit(cont, &dte, NULL, 1, false);
 		}
 
 		if (rc != 0)
@@ -1487,15 +1485,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	/* If piggyback DTX has been done everywhere, then need to handle CoS cache.
 	 * It is harmless to keep some partially committed DTX entries in CoS cache.
 	 */
-	if (result == 0 && dth->dth_cos_done) {
-		for (i = 0; i < dth->dth_dti_cos_count; i++)
-			dtx_cos_del(cont, &dth->dth_dti_cos[i],
-				    &dth->dth_leader_oid, dth->dth_dkey_hash);
-	} else {
-		for (i = 0; i < dth->dth_dti_cos_count; i++)
-			dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i],
-					      &dth->dth_leader_oid, dth->dth_dkey_hash);
-	}
+	dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos,
+			      dth->dth_dti_cos_count,
+			      (result == 0 && dth->dth_cos_done) ? true : false);
 
 	D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n",
 		DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash,
@@ -1654,7 +1646,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db
 		struct dtx_coll_entry	 *dce = NULL;
 
 		cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT,
-					    NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce);
+					    NULL, DAOS_EPOCH_MAX, true, &dtes,
+					    dbca->dbca_commit_req != NULL ? &dcks : NULL, &dce);
 		if (cnt <= 0)
 			D_GOTO(out, rc = cnt);
 
@@ -1675,9 +1668,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db
 		if (dce != NULL) {
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, dcks, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, dcks, cnt, true);
 		}
 		dtx_free_committable(dtes, dcks, dce, cnt);
 	}
@@ -2365,9 +2358,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 		if (dce != NULL) {
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, dcks, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, dcks, cnt, true);
 		}
 		dtx_free_committable(dtes, dcks, dce, cnt);
 		if (rc < 0) {
diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c
index 6e1d042b82b3..4c165f94d0c7 100644
--- a/src/dtx/dtx_cos.c
+++ b/src/dtx/dtx_cos.c
@@ -54,6 +54,8 @@ struct dtx_cos_rec_child {
 	d_list_t			 dcrc_gl_committable;
 	/* Link into related dcr_{reg,prio}_list. */
 	d_list_t			 dcrc_lo_link;
+	/* Link into container::sc_dtx_batched_list. */
+	d_list_t			 dcrc_batched_link;
 	union {
 		struct dtx_entry	*dcrc_dte;
 		struct dtx_coll_entry	*dcrc_dce;
@@ -61,8 +63,12 @@ struct dtx_cos_rec_child {
 	/* The DTX epoch. */
 	daos_epoch_t			 dcrc_epoch;
 	struct dtx_cos_rec		*dcrc_ptr;
+	uint64_t			 dcrc_ready_time;
 	uint32_t			 dcrc_piggyback_refs;
-	uint32_t			 dcrc_coll:1; /* For collective DTX. */
+	uint32_t			 dcrc_expcmt:1,
+					 dcrc_prio:1,
+					 dcrc_reg:1,
+					 dcrc_coll:1; /* For collective DTX. */
 };
 
 struct dtx_cos_rec_bundle {
@@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 		return -DER_NOMEM;
 	}
 
+	D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link);
+	dcrc->dcrc_ready_time = daos_getmtime_coarse();
 	dcrc->dcrc_epoch = rbund->epoch;
 	dcrc->dcrc_ptr = dcr;
 	if (rbund->flags & DCF_COLL) {
@@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 	d_tm_inc_gauge(tls->dt_committable, 1);
 
 	if (rbund->flags & DCF_EXP_CMT) {
+		dcrc->dcrc_expcmt = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list);
 		dcr->dcr_expcmt_count = 1;
 	} else if (rbund->flags & DCF_SHARED) {
+		dcrc->dcrc_prio = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 		dcr->dcr_prio_count = 1;
 	} else {
+		dcrc->dcrc_reg = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list);
 		dcr->dcr_reg_count = 1;
 	}
@@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec,
 	if (dcrc == NULL)
 		return -DER_NOMEM;
 
+	D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link);
+	dcrc->dcrc_ready_time = daos_getmtime_coarse();
 	dcrc->dcrc_epoch = rbund->epoch;
 	dcrc->dcrc_ptr = dcr;
 	if (rbund->flags & DCF_COLL) {
@@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec,
 	d_tm_inc_gauge(tls->dt_committable, 1);
 
 	if (rbund->flags & DCF_EXP_CMT) {
+		dcrc->dcrc_expcmt = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list);
 		dcr->dcr_expcmt_count++;
 	} else if (rbund->flags & DCF_SHARED) {
+		dcrc->dcrc_prio = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 		dcr->dcr_prio_count++;
 	} else {
+		dcrc->dcrc_reg = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list);
 		dcr->dcr_reg_count++;
 	}
@@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = {
 	.to_rec_update	= dtx_cos_rec_update,
 };
 
+static int
+dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc)
+{
+	struct dtx_cos_key	 key;
+	d_iov_t			 kiov;
+	struct dtx_cos_rec	*dcr = dcrc->dcrc_ptr;
+	uint64_t		 time = daos_getmtime_coarse() - dcrc->dcrc_ready_time;
+	int			 rc = 0;
+
+	d_list_del(&dcrc->dcrc_gl_committable);
+	d_list_del(&dcrc->dcrc_lo_link);
+	if (!d_list_empty(&dcrc->dcrc_batched_link))
+		d_list_del_init(&dcrc->dcrc_batched_link);
+
+	if (dcrc->dcrc_expcmt)
+		dcr->dcr_expcmt_count--;
+	else if (dcrc->dcrc_prio)
+		dcr->dcr_prio_count--;
+	else
+		dcr->dcr_reg_count--;
+
+	if (dcrc->dcrc_coll)
+		cont->sc_dtx_committable_coll_count--;
+	cont->sc_dtx_committable_count--;
+
+	d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time);
+
+	if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) {
+		key.oid = dcr->dcr_oid;
+		key.dkey_hash = dcr->dcr_dkey_hash;
+		d_iov_set(&kiov, &key, sizeof(key));
+		rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL);
+	}
+
+	DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc,
+		  "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid));
+
+	if (dcrc->dcrc_coll)
+		dtx_coll_entry_put(dcrc->dcrc_dce);
+	else
+		dtx_entry_put(dcrc->dcrc_dte);
+
+	D_FREE(dcrc);
+
+	return rc;
+}
+
 int
 dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 		      daos_unit_oid_t *oid, daos_epoch_t epoch, bool force,
@@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	uint32_t			  count;
 	uint32_t			  i = 0;
 
+	/* Last batched commit failed, let's re-commit them. */
+	if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) {
+		dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child,
+				    dcrc_batched_link);
+		if (unlikely(dcrc->dcrc_coll)) {
+			*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
+			return 1;
+		}
+
+		D_ALLOC_ARRAY(dte_buf, max_cnt);
+		if (dte_buf == NULL)
+			return -DER_NOMEM;
+
+		d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) {
+			D_ASSERT(i < max_cnt);
+			dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte);
+		}
+
+		*dtes = dte_buf;
+		return i;
+	}
+
 	/* Process collective DXT with higher priority. */
 	if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) {
 		d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) {
 			if (epoch >= dcrc->dcrc_epoch &&
 			    (dcrc->dcrc_piggyback_refs == 0 || force)) {
-				D_ALLOC_PTR(dck_buf);
-				if (dck_buf == NULL)
-					return -DER_NOMEM;
-
-				dck_buf->oid = dcrc->dcrc_ptr->dcr_oid;
-				dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
-				*dcks = dck_buf;
+				if (dcks != NULL) {
+					D_ALLOC_PTR(dck_buf);
+					if (dck_buf == NULL)
+						return -DER_NOMEM;
+
+					dck_buf->oid = dcrc->dcrc_ptr->dcr_oid;
+					dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+					*dcks = dck_buf;
+				} else {
+					d_list_add_tail(&dcrc->dcrc_batched_link,
+							&cont->sc_dtx_batched_list);
+				}
 				*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
 
 				return 1;
@@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	}
 
 	count = min(cont->sc_dtx_committable_count, max_cnt);
-	if (count == 0) {
-		*dtes = NULL;
+	if (count == 0)
 		return 0;
-	}
 
 	D_ALLOC_ARRAY(dte_buf, count);
 	if (dte_buf == NULL)
 		return -DER_NOMEM;
 
-	D_ALLOC_ARRAY(dck_buf, count);
-	if (dck_buf == NULL) {
-		D_FREE(dte_buf);
-		return -DER_NOMEM;
+	if (dcks != NULL) {
+		D_ALLOC_ARRAY(dck_buf, count);
+		if (dck_buf == NULL) {
+			D_FREE(dte_buf);
+			return -DER_NOMEM;
+		}
 	}
 
 	d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) {
@@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 				continue;
 
 			D_FREE(dte_buf);
-			dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
-			dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
-			*dcks = dck_buf;
+			if (dcks != NULL) {
+				dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
+				dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+				*dcks = dck_buf;
+			} else {
+				d_list_add_tail(&dcrc->dcrc_batched_link,
+						&cont->sc_dtx_batched_list);
+			}
 			*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
 
 			return 1;
 		}
 
 		dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte);
-		dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
-		dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+		if (dcks != NULL) {
+			dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
+			dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+		} else {
+			d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list);
+		}
 
 		if (++i >= count)
 			break;
@@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	if (i == 0) {
 		D_FREE(dte_buf);
 		D_FREE(dck_buf);
-		*dtes = NULL;
 	} else {
 		*dtes = dte_buf;
-		*dcks = dck_buf;
+		if (dcks != NULL)
+			*dcks = dck_buf;
 	}
 
 	return i;
@@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 }
 
 void
-dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid,
-		      daos_unit_oid_t *oid, uint64_t dkey_hash)
+dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
+		      struct dtx_id xid[], uint32_t count, bool rm)
 {
 	struct dtx_cos_key		 key;
 	d_iov_t				 kiov;
 	d_iov_t				 riov;
 	struct dtx_cos_rec		*dcr;
 	struct dtx_cos_rec_child	*dcrc;
+	int				 del = 0;
 	int				 rc;
+	int				 i;
 
 	key.oid = *oid;
 	key.dkey_hash = dkey_hash;
 	d_iov_set(&kiov, &key, sizeof(key));
 	d_iov_set(&riov, NULL, 0);
 
-	/* It is normal that the DTX entry (to be put) in CoS has already been removed by race. */
-
 	rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov);
 	if (rc == 0) {
 		dcr = (struct dtx_cos_rec *)riov.iov_buf;
-		d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-			if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
-				dcrc->dcrc_piggyback_refs--;
-				return;
+		for (i = 0; i < count; i++) {
+			d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
+				if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i],
+					   sizeof(struct dtx_id)) == 0) {
+					if (rm) {
+						rc = dtx_cos_del_one(cont, dcrc);
+						if (rc == 0)
+							del++;
+					} else {
+						dcrc->dcrc_piggyback_refs--;
+					}
+					break;
+				}
 			}
 		}
+
+		if (del > 0)
+			d_tm_dec_gauge(dtx_tls_get()->dt_committable, del);
 	}
 }
 
@@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid,
 			   DAOS_INTENT_UPDATE, &kiov, &riov, NULL);
 
 	if (flags & DCF_COLL)
-		D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, "
+		D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, "
 			 DF_UOID", key %lu, flags %x: "DF_RC"\n",
 			 DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid),
 			 (unsigned long)dkey_hash, flags, DP_RC(rc));
 	else
-		D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, "
+		D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, "
 			 DF_UOID", key %lu, flags %x: "DF_RC"\n",
 			 DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid),
 			 (unsigned long)dkey_hash, flags, DP_RC(rc));
@@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid,
 	dcr = (struct dtx_cos_rec *)riov.iov_buf;
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 1);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_prio_count--;
-
-		D_GOTO(out, found = 1);
 	}
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 2);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_reg_count--;
-
-		D_GOTO(out, found = 2);
 	}
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 3);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_expcmt_count--;
-
-		D_GOTO(out, found = 3);
 	}
 
 out:
-	if (found > 0) {
+	if (found > 0)
 		d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1);
 
-		if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 &&
-		    dcr->dcr_expcmt_count == 0)
-			rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL);
-	}
-
 	if (rc == 0 && found == 0)
 		rc = -DER_NONEXIST;
 
-	D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO,
-		 "Remove DTX "DF_DTI" from CoS "
-		 "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n",
-		 DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash,
-		 found == 1 ? "has" : "has not", DP_RC(rc));
+	DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc,
+		  "Remove DTX from CoS cache "DF_UOID", key %lu",
+		  DP_UOID(*oid), (unsigned long)dkey_hash);
 
 	return rc == -DER_NONEXIST ? 0 : rc;
 }
@@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont)
 	return dcrc->dcrc_epoch;
 }
 
+/*
+ * It is inefficient to search some item on a very long list. So let's skip
+ * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal.
+ */
+#define DTX_COS_SEARCH_MAX	32
+
 void
 dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 	     daos_unit_oid_t *oid, uint64_t dkey_hash)
@@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 
 	dcr = (struct dtx_cos_rec *)riov.iov_buf;
 
+	if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX)
+		goto expcmt;
+
 	d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) {
 		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			dcrc->dcrc_reg = 0;
+			dcrc->dcrc_prio = 1;
 			d_list_del(&dcrc->dcrc_lo_link);
 			d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 			dcr->dcr_reg_count--;
@@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 		}
 	}
 
-	d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
-			d_list_del(&dcrc->dcrc_lo_link);
-			d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
-
-			D_GOTO(out, found = true);
-		}
-	}
+expcmt:
+	if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX)
+		goto out;
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) {
 		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0)
@@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 
 	/* It is normal that the DTX entry (for priority) in CoS has been committed by race. */
 }
+
+void
+dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count)
+{
+	struct dtx_cos_rec_child	*dcrc;
+	int				 del = 0;
+	int				 rc;
+	int				 i = 0;
+	bool				 found;
+
+	while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child,
+					dcrc_batched_link)) != NULL) {
+		for (found = false; i < count && !found; i++) {
+			/*
+			 * Some entries in the sc_dtx_batched_list may have been committed by
+			 * others by race. Since the entries order in the sc_dtx_batched_list
+			 * will not be changed, let's compare with xid[i] via one cycle scan.
+			 */
+			if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) {
+				found = true;
+
+				if (rm[i]) {
+					rc = dtx_cos_del_one(cont, dcrc);
+					if (rc == 0)
+						del++;
+				}
+			}
+		}
+
+		/* There must be one in xid array that matches current dcrc. */
+		D_ASSERT(found);
+	}
+
+	if (del > 0)
+		d_tm_dec_gauge(dtx_tls_get()->dt_committable, del);
+}
diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h
index a42bcc1d7d68..06d0333dd77e 100644
--- a/src/dtx/dtx_internal.h
+++ b/src/dtx/dtx_internal.h
@@ -213,6 +213,7 @@ struct dtx_pool_metrics {
 struct dtx_tls {
 	struct d_tm_node_t	*dt_committable;
 	struct d_tm_node_t	*dt_dtx_leader_total;
+	struct d_tm_node_t	*dt_async_cmt_lat;
 	uint64_t		 dt_agg_gen;
 	uint32_t		 dt_batched_ult_cnt;
 };
@@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont);
 void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 		  daos_unit_oid_t *oid, uint64_t dkey_hash);
 
+void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[],
+			 uint32_t count);
+
 /* dtx_rpc.c */
 int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte,
 	      daos_epoch_t epoch);
diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c
index b98a6469954c..756dfe0ca440 100644
--- a/src/dtx/dtx_resync.c
+++ b/src/dtx/dtx_resync.c
@@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont,
 	}
 
 	if (j > 0) {
-		rc = dtx_commit(cont, dtes, dcks, j);
+		rc = dtx_commit(cont, dtes, dcks, j, true);
 		if (rc < 0)
 			D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n",
 				DP_RC(rc));
@@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un
 
 		dck.oid = oid;
 		dck.dkey_hash = dkey_hash;
-		rc = dtx_coll_commit(cont, dce, &dck);
+		rc = dtx_coll_commit(cont, dce, &dck, true);
 	}
 
 	dtx_coll_entry_put(dce);
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index e654047a6218..5423f6c0cc88 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info)
 	}
 
 out:
-	D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: "
-		"epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future,
-		drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc);
+	DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc,
+		  "DTX req for opc %x (req %p future %p) got reply from %d/%d: "
+		  "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future,
+		  drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0);
 
 	drr->drr_comp = 1;
 	drr->drr_result = rc;
@@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance)
 
 		if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 &&
 			     DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT)))
-			rc = dtx_req_send(dca->dca_drr, 1);
+			dtx_req_send(dca->dca_drr, 1);
 		else
-			rc = dtx_req_send(dca->dca_drr, dca->dca_epoch);
-		if (rc != 0) {
-			/* If the first sub-RPC failed, then break, otherwise
-			 * other remote replicas may have already received the
-			 * RPC and executed it, so have to go ahead.
-			 */
-			if (dca->dca_i == 0) {
-				ABT_future_free(&dra->dra_future);
-				return rc;
-			}
-		}
+			dtx_req_send(dca->dca_drr, dca->dca_epoch);
+		/*
+		 * Do not care dtx_req_send result, itself or its cb func will set dra->dra_future.
+		 * Each RPC is independent from the others, let's go head to handle the other RPCs
+		 * and set dra->dra_future that will avoid blocking the RPC sponsor - dtx_req_wait.
+		 */
 
 		/* dca->dca_drr maybe not points to a real entry if all RPCs have been sent. */
 		dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next,
@@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance)
 	rc = dtx_req_list_send(dca, is_reentrance);
 	if (rc == DSS_CHORE_YIELD)
 		return DSS_CHORE_YIELD;
-	if (rc == DSS_CHORE_DONE)
-		rc = 0;
-	if (rc != 0)
-		dca->dca_dra.dra_result = rc;
-	D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore,
-		 dca->dca_dra.dra_opc, rc);
+	D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n",
+		  dca->dca_dra.dra_opc, rc);
 	if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) {
 		rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0);
 		D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc);
@@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list,  struct dtx_entry **dtes,
 			}
 
 			rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper);
-			if (rc != 0)
+			if (rc != 0) {
+				ABT_eventual_free(&dca->dca_chore_eventual);
 				goto out;
+			}
 
 			rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL);
 			D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc);
@@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list,  struct dtx_entry **dtes,
  */
 int
 dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
-	   struct dtx_cos_key *dcks, int count)
+	   struct dtx_cos_key *dcks, int count, bool has_cos)
 {
 	struct dtx_common_args	 dca;
 	struct dtx_req_args	*dra = &dca.dca_dra;
@@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
 			rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count,
 						DTE_PARTIAL_COMMITTED);
 	} else {
-		if (dcks != NULL) {
+		if (has_cos) {
 			if (count > 1) {
 				D_ALLOC_ARRAY(rm_cos, count);
 				if (rm_cos == NULL)
@@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
 		}
 
 		if (rc1 == 0 && rm_cos != NULL) {
-			for (i = 0; i < count; i++) {
-				if (rm_cos[i]) {
-					D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub));
-					dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid,
-						    dcks[i].dkey_hash);
+			if (dcks != NULL) {
+				for (i = 0; i < count; i++) {
+					if (rm_cos[i]) {
+						D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub));
+						dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid,
+							    dcks[i].dkey_hash);
+					}
 				}
+			} else {
+				dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count);
 			}
 		}
 
@@ -1237,7 +1235,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 			case DTX_ST_COMMITTABLE:
 				dck.oid = dsp->dsp_oid;
 				dck.dkey_hash = dsp->dsp_dkey_hash;
-				rc = dtx_commit(cont, &pdte, &dck, 1);
+				rc = dtx_commit(cont, &pdte, &dck, 1, true);
 				if (rc < 0 && rc != -DER_NONEXIST && for_io)
 					d_list_add_tail(&dsp->dsp_link, cmt_list);
 				else
@@ -1258,7 +1256,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 		case DSHR_NEED_COMMIT: {
 			dck.oid = dsp->dsp_oid;
 			dck.dkey_hash = dsp->dsp_dkey_hash;
-			rc = dtx_commit(cont, &pdte, &dck, 1);
+			rc = dtx_commit(cont, &pdte, &dck, 1, true);
 			if (rc < 0 && rc != -DER_NONEXIST && for_io)
 				d_list_add_tail(&dsp->dsp_link, cmt_list);
 			else
@@ -1571,17 +1569,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret)
 {
 	int	rc;
 
-	rc = ABT_future_wait(dcra->dcra_future);
-	D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE,
-		 "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n",
-		 dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result);
-	ABT_future_free(&dcra->dcra_future);
+	if (dcra->dcra_future != ABT_FUTURE_NULL) {
+		rc = ABT_future_wait(dcra->dcra_future);
+		D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE,
+			 "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n",
+			 dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result);
+		ABT_future_free(&dcra->dcra_future);
+	}
 
 	return ret != 0 ? ret : dcra->dcra_result;
 }
 
 int
-dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck)
+dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck,
+		bool has_cos)
 {
 	struct dtx_coll_rpc_args	 dcra = { 0 };
 	int				*results = NULL;
@@ -1591,11 +1592,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
 	int				 rc1 = 0;
 	int				 rc2 = 0;
 	int				 i;
+	bool				 cos = true;
 
 	if (dce->dce_ranks != NULL)
 		rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra);
 
+	/*
+	 * NOTE: Before committing the DTX on remote participants, we cannot remove the active
+	 *	 DTX locally; otherwise, the local committed DTX entry may be removed via DTX
+	 *	 aggregation before remote participants commit done. Under such case, if some
+	 *	 remote DTX participant triggere DTX_REFRESH for such DTX during the interval,
+	 *	 then it will get -DER_TX_UNCERTAIN, that may cause related application to be
+	 *	 failed. So here, we let remote participants to commit firstly, if failed, we
+	 *	 will ask the leader to retry the commit until all participants got committed.
+	 */
 	if (dce->dce_bitmap != NULL) {
+		clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
 		len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0,
 					  DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap,
 					  &results);
@@ -1634,8 +1646,12 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
 	 *	 to remove the collective DTX entry from the CoS even if the commit failed remotely.
 	 *	 Otherwise, the batched commit ULT may be blocked by such "bad" entry.
 	 */
-	if (rc2 == 0 && dck != NULL)
-		dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash);
+	if (rc2 == 0 && has_cos) {
+		if (dck != NULL)
+			dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash);
+		else
+			dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1);
+	}
 
 	D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE,
 		 "Collectively commit DTX "DF_DTI": %d/%d/%d\n",
@@ -1658,7 +1674,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
 	if (dce->dce_ranks != NULL)
 		rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra);
 
+	/*
+	 * NOTE: The DTX abort maybe triggered by dtx_leader_end() for timeout on some DTX
+	 *	 participant(s). Under such case, the client side RPC sponsor may also hit
+	 *	 the RPC timeout and resends related RPC to the leader. Here, to avoid DTX
+	 *	 abort and resend RPC forwarding being executed in parallel, we will abort
+	 *	 local DTX after remote done, before that the logic of handling resent RPC
+	 *	 on server will find the local pinned DTX entry then notify related client
+	 *	 to resend sometime later.
+	 */
 	if (dce->dce_bitmap != NULL) {
+		clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
 		len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch,
 					  DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap,
 					  &results);
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 93cc6744a2df..9e78100c451b 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id)
 
 	tls->dt_agg_gen = 1;
 	rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE,
-			     "total number of committable DTX entries",
-			     "entries", "io/dtx/committable/tgt_%u", tgt_id);
+			     "total number of committable DTX entries", "entry",
+			     "io/dtx/committable/tgt_%u", tgt_id);
 	if (rc != DER_SUCCESS)
 		D_WARN("Failed to create DTX committable metric: " DF_RC"\n",
 		       DP_RC(rc));
@@ -48,6 +48,13 @@ dtx_tls_init(int tags, int xs_id, int tgt_id)
 		D_WARN("Failed to create DTX leader metric: " DF_RC"\n",
 		       DP_RC(rc));
 
+	rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE,
+			     "DTX async commit latency", "ms",
+			     "io/dtx/async_cmt_lat/tgt_%u", tgt_id);
+	if (rc != DER_SUCCESS)
+		D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n",
+		       DP_RC(rc));
+
 	return tls;
 }
 
@@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id)
 
 	rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER,
 			     "total number of processed sync DTX_COMMIT", "ops",
-			     "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id);
+			     "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id);
 	if (rc != DER_SUCCESS)
 		D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc));
 
diff --git a/src/engine/ult.c b/src/engine/ult.c
index 94a2a0f93908..fbeb3f538fa8 100644
--- a/src/engine/ult.c
+++ b/src/engine/ult.c
@@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id)
 		break;
 	case DSS_XS_OFFLOAD:
 		if (dss_numa_nr > 1)
-			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			return sched_ult2xs_multisocket(xs_type, tgt_id);
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h
index 793d585c07a4..f99f4df14e39 100644
--- a/src/include/daos_srv/container.h
+++ b/src/include/daos_srv/container.h
@@ -134,6 +134,8 @@ struct ds_cont_child {
 	d_list_t		 sc_dtx_cos_list;
 	/* The global list for committable collective DTXs. */
 	d_list_t		 sc_dtx_coll_list;
+	/* The list for current DTX batched commit. */
+	d_list_t		 sc_dtx_batched_list;
 	/* the pool map version of updating DAOS_PROP_CO_STATUS prop */
 	uint32_t		 sc_status_pm_ver;
 };
diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h
index 7b3f7e9ee57c..7c60d2deaa03 100644
--- a/src/include/daos_srv/dtx_srv.h
+++ b/src/include/daos_srv/dtx_srv.h
@@ -318,8 +318,8 @@ int
 dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
 		      int max, struct dtx_id **dtis);
 void
-dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid,
-		      daos_unit_oid_t *oid, uint64_t dkey_hash);
+dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
+		      struct dtx_id xid[], uint32_t count, bool rm);
 int
 dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func,
 		    dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg);
@@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 		 daos_epoch_t epoch);
 
 int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
-	       struct dtx_cos_key *dcks, int count);
+	       struct dtx_cos_key *dcks, int count, bool has_cos);
 
 int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch);
 
 int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont);
 
 int
-dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck);
+dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck,
+		bool has_cos);
 
 int
 dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch);
diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py
index db424b6de685..17f1753b1003 100644
--- a/src/tests/ftest/util/telemetry_utils.py
+++ b/src/tests/ftest/util/telemetry_utils.py
@@ -90,6 +90,7 @@ class TelemetryUtils():
         "engine_pool_ops_dtx_coll_commit",
         "engine_pool_ops_dtx_commit",
         "engine_pool_ops_dtx_refresh",
+        "engine_pool_ops_dtx_sync_commit",
         "engine_pool_ops_ec_agg",
         "engine_pool_ops_ec_rep",
         "engine_pool_ops_fetch",
@@ -200,6 +201,8 @@ class TelemetryUtils():
         "engine_dmabuff_queued_reqs",
         "engine_dmabuff_grab_errs",
         *_gen_stats_metrics("engine_dmabuff_grab_retries")]
+    ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \
+        _gen_stats_metrics("engine_io_dtx_async_cmt_lat")
     ENGINE_IO_DTX_COMMITTABLE_METRICS = \
         _gen_stats_metrics("engine_io_dtx_committable")
     ENGINE_IO_DTX_COMMITTED_METRICS = \
@@ -304,7 +307,8 @@ class TelemetryUtils():
         _gen_stats_metrics("engine_io_ops_tgt_update_active")
     ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \
         _gen_stats_metrics("engine_io_ops_update_active")
-    ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\
+    ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\
+        ENGINE_IO_DTX_COMMITTABLE_METRICS +\
         ENGINE_IO_DTX_COMMITTED_METRICS +\
         ENGINE_IO_LATENCY_FETCH_METRICS +\
         ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\

From 2b5620b230df6a7c3700f10128998cfb5360b2fc Mon Sep 17 00:00:00 2001
From: Jerome Soumagne <jerome.soumagne@intel.com>
Date: Thu, 24 Oct 2024 10:47:16 -0500
Subject: [PATCH 19/26] DAOS-14262 cart: add ability to select traffic class
 for SWIM context  (#14893) (#14917)

Add SWIM_TRAFFIC_CLASS env var (default is unspec)

Signed-off-by: Jerome Soumagne <jerome.soumagne@intel.com>
---
 src/cart/README.env           | 12 +++++++-----
 src/cart/crt_hg.c             |  3 +++
 src/cart/crt_init.c           | 36 ++++++++++++++++++++++++++---------
 src/cart/crt_internal_types.h | 15 +++++++++++++++
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/src/cart/README.env b/src/cart/README.env
index 00f270d7a413..b90939c8c720 100644
--- a/src/cart/README.env
+++ b/src/cart/README.env
@@ -1,13 +1,10 @@
 This file lists the environment variables used in CaRT.
 
  . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR)
-   It determines which mercury NA plugin to be used:
+   It determines which mercury NA plugin and transport to be used:
    - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider
-   - set it as "ofi+gni" to use OFI gni provider
    - set it as "sm" to use SM plugin which only works within single node
-   - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider.
-   - set it as "ofi+sockets" to use OFI sockets provider
-      NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm"
+   - set it as "ofi+tcp" to use OFI tcp provider.
    - by default (not set or set as any other value) it will use ofi tcp
       provider.
 
@@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT.
    start copying data in an effort to release multi-recv buffers. Copy will occur when at
    most D_MRECV_BUF_COPY buffers remain.
 
+   SWIM_TRAFFIC_CLASS
+   (server only) Select a traffic class for the SWIM protocol to use and prevent potential
+   traffic congestion. Available options are: "unspec" (default), "best_effort",
+   "low_latency", "bulk_data".
+
diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c
index 789d75ceb31b..244373c9228e 100644
--- a/src/cart/crt_hg.c
+++ b/src/cart/crt_hg.c
@@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
 	init_info.request_post_incr         = crt_gdata.cg_post_incr;
 	init_info.multi_recv_op_max         = crt_gdata.cg_mrecv_buf;
 	init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy;
+	/* Separate SWIM traffic in an effort to prevent potential congestion. */
+	if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx)
+		init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;
 
 	hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
 	if (hg_class == NULL) {
diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c
index 21fc184d4469..48d2090a5b36 100644
--- a/src/cart/crt_init.c
+++ b/src/cart/crt_init.c
@@ -18,6 +18,10 @@ static volatile int	gdata_init_flag;
 struct crt_plugin_gdata crt_plugin_gdata;
 static bool		g_prov_settings_applied[CRT_PROV_COUNT];
 
+#define X(a, b) b,
+static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES};
+#undef X
+
 static void
 crt_lib_init(void) __attribute__((__constructor__));
 
@@ -237,18 +241,30 @@ crt_gdata_dump(void)
 	DUMP_GDATA_FIELD("%d", cg_rpc_quota);
 }
 
+static enum crt_traffic_class
+crt_str_to_tc(const char *str)
+{
+	enum crt_traffic_class i = 0;
+
+	while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN)
+		i++;
+
+	return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i;
+}
+
 /* first step init - for initializing crt_gdata */
 static int data_init(int server, crt_init_options_t *opt)
 {
-	uint32_t        timeout = 0;
-	uint32_t	credits;
-	uint32_t	fi_univ_size = 0;
-	uint32_t	mem_pin_enable = 0;
-	uint32_t        is_secondary;
-	uint32_t        post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
-	unsigned int    mrecv_buf      = CRT_HG_MRECV_BUF;
-	unsigned int    mrecv_buf_copy = 0; /* buf copy disabled by default */
-	int             rc             = 0;
+	uint32_t     timeout = 0;
+	uint32_t     credits;
+	uint32_t     fi_univ_size   = 0;
+	uint32_t     mem_pin_enable = 0;
+	uint32_t     is_secondary;
+	uint32_t     post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
+	unsigned int mrecv_buf          = CRT_HG_MRECV_BUF;
+	unsigned int mrecv_buf_copy     = 0; /* buf copy disabled by default */
+	char        *swim_traffic_class = NULL;
+	int          rc                 = 0;
 
 	crt_env_dump();
 
@@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt)
 	crt_gdata.cg_mrecv_buf = mrecv_buf;
 	crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy);
 	crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy;
+	crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class);
+	crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class);
 
 	is_secondary = 0;
 	/* Apply CART-890 workaround for server side only */
diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h
index 38133e0cf170..c3eb9cae12cb 100644
--- a/src/cart/crt_internal_types.h
+++ b/src/cart/crt_internal_types.h
@@ -42,6 +42,17 @@ struct crt_na_config {
 	char            **noc_domain_str; /* Array of domains */
 };
 
+#define CRT_TRAFFIC_CLASSES                                                                        \
+	X(CRT_TC_UNSPEC, "unspec")           /* Leave it upon plugin to choose */                  \
+	X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */                                     \
+	X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */                                     \
+	X(CRT_TC_BULK_DATA, "bulk_data")     /* Bulk data */                                       \
+	X(CRT_TC_UNKNOWN, "unknown")         /* Unknown */
+
+#define X(a, b) a,
+enum crt_traffic_class { CRT_TRAFFIC_CLASSES };
+#undef X
+
 struct crt_prov_gdata {
 	/** NA plugin type */
 	int			cpg_provider;
@@ -105,6 +116,9 @@ struct crt_gdata {
 	/** global swim index for all servers */
 	int32_t			cg_swim_crt_idx;
 
+	/** traffic class used by SWIM */
+	enum crt_traffic_class   cg_swim_tc;
+
 	/** credits limitation for #in-flight RPCs per target EP CTX */
 	uint32_t		cg_credit_ep_ctx;
 
@@ -220,6 +234,7 @@ struct crt_event_cb_priv {
 	ENV(SWIM_PING_TIMEOUT)                                                                     \
 	ENV(SWIM_PROTOCOL_PERIOD_LEN)                                                              \
 	ENV(SWIM_SUSPECT_TIMEOUT)                                                                  \
+	ENV_STR(SWIM_TRAFFIC_CLASS)                                                                \
 	ENV_STR(UCX_IB_FORK_INIT)
 
 /* uint env */

From 70b12e3331fea2c44beeba6792b316f1157eec51 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Fri, 25 Oct 2024 00:35:58 +0800
Subject: [PATCH 20/26] DAOS-16469 container: Lower log level for
 cont_aggregate_interval (#15283)

To reduce the side-effect caused by frequent log with -DER_INPROGRESS.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/container/srv_target.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index 35ac3d4285c5..caf2a4b2ee74 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb,
 		if (rc == -DER_SHUTDOWN) {
 			break;	/* pool destroyed */
 		} else if (rc < 0) {
-			DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc,
+			DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc,
 				  DF_CONT ": %s aggregate failed",
 				  DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
 				  param->ap_vos_agg ? "VOS" : "EC");

From 2a1892f02c08135e9d7d90d23c589df30068b330 Mon Sep 17 00:00:00 2001
From: Jeff Olivier <jeffolivier@google.com>
Date: Thu, 24 Oct 2024 12:33:26 -0600
Subject: [PATCH 21/26] DAOS-16716 ci: Set reference build for PRs (#15379)

Release branch PRs should use the release branch build
instead of master branch build for Fault Injection reference

Signed-off-by: Jeff Olivier <jeffolivier@google.com>
---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 13ccc5128950..8b2c027b15f8 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1038,7 +1038,7 @@ pipeline {
                     }
                     post {
                         always {
-                            discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master',
+                            discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6',
                                                       scm: 'daos-stack/daos',
                                                       requiredResult: hudson.model.Result.UNSTABLE
                             recordIssues enabledForFailure: true,

From 23f0787ab188704aeb0c0d000e2e181777831690 Mon Sep 17 00:00:00 2001
From: Alexander Oganezov <alexander.a.oganezov@intel.com>
Date: Fri, 25 Oct 2024 08:51:07 -0700
Subject: [PATCH 22/26] DAOS-15914: crt_reply_send_input_free() (#14817)

- New crt_reply_send_input_free() API added which releases input buffer right
  after HG_Respond() instead of waiting until the handle is destroyed.
- srv_obj.c calls changed to use new crt_reply_send_input_free()
- I/O context takes refcount on RPC
- only release input buffer for target update

Signed-off-by: Alexander A Oganezov <alexander.a.oganezov@intel.com>
Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Co-authored-by: Liang Zhen <liang.zhen@intel.com>
---
 src/cart/crt_hg.c      | 10 +++++++++
 src/cart/crt_rpc.c     | 20 ++++++++++++++++++
 src/cart/crt_rpc.h     | 47 +++++++++++++++++++++---------------------
 src/dtx/dtx_srv.c      |  2 +-
 src/include/cart/api.h | 15 ++++++++++++++
 src/object/srv_obj.c   | 22 ++++++++++++++------
 6 files changed, 86 insertions(+), 30 deletions(-)

diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c
index 244373c9228e..26b54b52ec3d 100644
--- a/src/cart/crt_hg.c
+++ b/src/cart/crt_hg.c
@@ -1482,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv)
 		rc = crt_hgret_2_der(hg_ret);
 	}
 
+	/* Release input buffer */
+	if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) {
+		hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl);
+		if (hg_ret != HG_SUCCESS) {
+			RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n",
+				  DP_HG_RC(hg_ret));
+			/* Fall through */
+		}
+	}
+
 	return rc;
 }
 
diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c
index 72d8214aa09c..ead03d1bf290 100644
--- a/src/cart/crt_rpc.c
+++ b/src/cart/crt_rpc.c
@@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
 	return rc;
 }
 
+int
+crt_reply_send_input_free(crt_rpc_t *req)
+{
+	struct crt_rpc_priv *rpc_priv = NULL;
+	int                  rc       = 0;
+
+	if (req == NULL) {
+		D_ERROR("invalid parameter (NULL req).\n");
+		D_GOTO(out, rc = -DER_INVAL);
+	}
+
+	rpc_priv                          = container_of(req, struct crt_rpc_priv, crp_pub);
+	rpc_priv->crp_release_input_early = 1;
+
+	return crt_reply_send(req);
+
+out:
+	return rc;
+}
+
 int
 crt_reply_send(crt_rpc_t *req)
 {
diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h
index a5e6afee7800..3a5909331038 100644
--- a/src/cart/crt_rpc.h
+++ b/src/cart/crt_rpc.h
@@ -166,29 +166,30 @@ struct crt_rpc_priv {
 	 * match with crp_req_hdr.cch_flags.
 	 */
 	uint32_t		crp_flags;
-	uint32_t		crp_srv:1, /* flag of server received request */
-				crp_output_got:1,
-				crp_input_got:1,
-				/* flag of collective RPC request */
-				crp_coll:1,
-				/* flag of crp_tgt_uri need to be freed */
-				crp_uri_free:1,
-				/* flag of forwarded rpc for corpc */
-				crp_forward:1,
-				/* flag of in timeout binheap */
-				crp_in_binheap:1,
-				/* set if a call to crt_req_reply pending */
-				crp_reply_pending:1,
-				/* set to 1 if target ep is set */
-				crp_have_ep:1,
-				/* RPC is tracked by the context */
-				crp_ctx_tracked:1,
-				/* 1 if RPC fails HLC epsilon check */
-				crp_fail_hlc:1,
-				/* RPC completed flag */
-				crp_completed:1,
-				/* RPC originated from a primary provider */
-				crp_src_is_primary:1;
+	uint32_t                 crp_srv : 1, /* flag of server received request */
+	    crp_output_got : 1, crp_input_got : 1,
+	    /* flag of collective RPC request */
+	    crp_coll                : 1,
+	    /* flag of crp_tgt_uri need to be freed */
+	    crp_uri_free            : 1,
+	    /* flag of forwarded rpc for corpc */
+	    crp_forward             : 1,
+	    /* flag of in timeout binheap */
+	    crp_in_binheap          : 1,
+	    /* set if a call to crt_req_reply pending */
+	    crp_reply_pending       : 1,
+	    /* set to 1 if target ep is set */
+	    crp_have_ep             : 1,
+	    /* RPC is tracked by the context */
+	    crp_ctx_tracked         : 1,
+	    /* 1 if RPC fails HLC epsilon check */
+	    crp_fail_hlc            : 1,
+	    /* RPC completed flag */
+	    crp_completed           : 1,
+	    /* RPC originated from a primary provider */
+	    crp_src_is_primary      : 1,
+	    /* release input buffer early */
+	    crp_release_input_early : 1;
 
 	struct crt_opc_info	*crp_opc_info;
 	/* corpc info, only valid when (crp_coll == 1) */
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 9e78100c451b..0fd4e7c513f4 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -337,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc)
 	dout->do_status = rc;
 	/* For DTX_COMMIT, it is the count of real committed DTX entries. */
 	dout->do_misc = committed;
-	rc = crt_reply_send(rpc);
+	rc = crt_reply_send_input_free(rpc);
 	if (rc != 0)
 		D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc,
 			DP_RC(rc));
diff --git a/src/include/cart/api.h b/src/include/cart/api.h
index 21941d7cb983..16fc2091d4b2 100644
--- a/src/include/cart/api.h
+++ b/src/include/cart/api.h
@@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg);
 int
 crt_reply_send(crt_rpc_t *req);
 
+/**
+ * Send an RPC reply and free the input buffer immediately.
+ * Only to be called on the server side.
+ *
+ * \param[in] req              pointer to RPC request
+ *
+ * \return                     DER_SUCCESS on success, negative value if error
+ *
+ * \note the crt_rpc_t is exported to user, caller should fill the
+ *        crt_rpc_t::cr_output before sending the RPC reply.
+ *        See \ref crt_req_create.
+ */
+int
+crt_reply_send_input_free(crt_rpc_t *req);
+
 /**
  * Return request buffer
  *
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index a51682b47853..f220149a5326 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc,
 }
 
 static void
-obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch,
+obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input,
 	     struct obj_io_context *ioc)
 {
 	struct obj_rw_out	*orwo = crt_reply_get(rpc);
@@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch,
 		ioc->ioc_map_ver, orwo->orw_epoch, status);
 
 	if (!ioc->ioc_lost_reply) {
-		rc = crt_reply_send(rpc);
+		if (release_input)
+			rc = crt_reply_send_input_free(rpc);
+		else
+			rc = crt_reply_send(rpc);
+
 		if (rc != 0)
 			D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc));
 	} else {
@@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc
 
 	D_ASSERT(ioc != NULL);
 	memset(ioc, 0, sizeof(*ioc));
+
+	crt_req_addref(rpc);
 	ioc->ioc_rpc = rpc;
 	ioc->ioc_opc = opc_get(rpc->cr_opc);
 	rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh);
@@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err)
 		ds_cont_child_put(ioc->ioc_coc);
 		ioc->ioc_coc = NULL;
 	}
+	if (ioc->ioc_rpc) {
+		crt_req_decref(ioc->ioc_rpc);
+		ioc->ioc_rpc = NULL;
+	}
 }
 
 /* Setup lite IO context, it is only for compound RPC so far:
@@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc)
 	rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey,
 				  &iod->iod_name, &recx);
 out:
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, false, &ioc);
 	obj_ioc_end(&ioc, rc);
 }
 
@@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc)
 		D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid),
 			DP_RC(rc1));
 out:
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, false, &ioc);
 	obj_ioc_end(&ioc, rc);
 }
 
@@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc)
 out:
 	if (dth != NULL)
 		rc = dtx_end(dth, ioc.ioc_coc, rc);
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, true, &ioc);
 	D_FREE(mbs);
 	obj_ioc_end(&ioc, rc);
 }
@@ -3073,7 +3083,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 	if (ioc.ioc_map_ver < max_ver)
 		ioc.ioc_map_ver = max_ver;
 
-	obj_rw_reply(rpc, rc, epoch.oe_value, &ioc);
+	obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc);
 	D_FREE(mbs);
 	D_FREE(dti_cos);
 	obj_ioc_end(&ioc, rc);

From 67da3b9b924c16b7b240fcac94e6cdd3044d6415 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Fri, 25 Oct 2024 23:52:33 +0800
Subject: [PATCH 23/26] DAOS-16721 object: fix coll RPC for obj with sparse
 layout - b26 (#15376)

The old implementation did not correctly calculate some collective
object RPC size, and may cause trouble when need bulk data transfer
for large collective object RPC. It also potentially affects how to
dispatch collective RPCs from leader to other engines.

The patch also addes more sanity check for coll-punch RPC to detect
potential DRAM corruption.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/dtx/dtx_coll.c        |   6 ++-
 src/dtx/dtx_rpc.c         |   5 +-
 src/object/cli_coll.c     | 111 +++++++++++++++++++++++++-------------
 src/object/cli_shard.c    |   7 +--
 src/object/obj_internal.h |   8 ++-
 src/object/obj_utils.c    |   2 +-
 src/object/srv_coll.c     |  12 +++--
 src/object/srv_obj.c      |   8 +--
 src/vos/vos_dtx.c         |   3 --
 9 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c
index 863307e9a7f6..7e7c81991551 100644
--- a/src/dtx/dtx_coll.c
+++ b/src/dtx/dtx_coll.c
@@ -80,8 +80,12 @@ dtx_coll_prep_ult(void *arg)
 				DP_UUID(cont->sc_uuid), DP_RC(rc));
 	}
 
-	if (dcpa->dcpa_result != 0)
+	if (dcpa->dcpa_result != 0) {
+		if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST)
+			D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n",
+				DP_DTI(&dci->dci_xid), opc, DP_RC(rc));
 		goto out;
+	}
 
 	dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1,
 					  dci->dci_version, cont->sc_pool->spc_map_version,
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index 5423f6c0cc88..2ccbfec2734d 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -974,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 		if (dsp->dsp_mbs == NULL) {
 			rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs);
 			if (rc != 0) {
-				if (rc < 0 && rc != -DER_NONEXIST && for_io)
+				if (rc < 0 && rc != -DER_NONEXIST && for_io) {
+					D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n",
+						DP_DTI(&dsp->dsp_xid), DP_RC(rc));
 					goto out;
+				}
 
 				drop = true;
 				goto next;
diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c
index e05abadf3cf4..3285bec58b34 100644
--- a/src/object/cli_coll.c
+++ b/src/object/cli_coll.c
@@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool
 		if (coa->coa_sparse == 0)
 			coa->coa_dct_cap = obj_ranks;
 	}
+
+	/* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). */
+	coa->coa_min_rank = obj->cob_min_rank;
+
 	D_RWLOCK_UNLOCK(&obj->cob_lock);
 
 	if (coa->coa_sparse) {
@@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool
 		coa->coa_dct_nr = -1;
 	}
 
-	coa->coa_max_dct_sz = 0;
 	coa->coa_max_shard_nr = 0;
 	coa->coa_max_bitmap_sz = 0;
 	coa->coa_target_nr = 0;
@@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa)
 	coa->coa_dct_nr = 0;
 }
 
+static void
+obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct,
+		      uint32_t *size, bool copy)
+{
+	struct daos_coll_shard	*dcs;
+	uint32_t		 dct_size;
+	int			 i;
+
+	/* The size may be over estimated, no matter. */
+	dct_size = sizeof(*dct) + dct->dct_bitmap_sz +
+		   sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1);
+
+	for (i = 0; i <= dct->dct_max_shard; i++) {
+		dcs = &dct->dct_shards[i];
+		if (dcs->dcs_nr > 1)
+			dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr;
+	}
+
+	if (coa->coa_for_modify)
+		dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr;
+
+	if (coa->coa_max_dct_sz < dct_size)
+		coa->coa_max_dct_sz = dct_size;
+
+	if (copy)
+		memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct));
+
+	coa->coa_dct_nr++;
+	*size += dct_size;
+}
+
+struct obj_coll_tree_args {
+	struct coll_oper_args	*coa;
+	uint32_t		*size;
+};
+
 static int
 obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg)
 {
-	struct coll_oper_args	*coa = arg;
-	struct daos_coll_target	*dct = val->iov_buf;
+	struct obj_coll_tree_args	*octa = arg;
+	struct coll_oper_args		*coa = octa->coa;
+	struct daos_coll_target		*dct = val->iov_buf;
 
 	D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap,
 		  "Too short pre-allcoated dct_array: %u vs %u\n",
 		  coa->coa_dct_nr, coa->coa_dct_cap);
+	D_ASSERT(dct->dct_bitmap != NULL);
 
-	memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct));
+	obj_coll_collapse_one(coa, dct, octa->size, true);
 
 	/* The following members have been migrated into coa->coa_dcts. */
 	dct->dct_bitmap = NULL;
@@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg)
 static int
 obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size)
 {
+	struct obj_coll_tree_args	 octa;
 	struct coll_sparse_targets	*tree = coa->coa_tree;
 	int				 rc = 0;
 
@@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size)
 		D_GOTO(out, rc = -DER_NOMEM);
 
 	coa->coa_sparse = 0;
-	rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa);
+	coa->coa_raw_sparse = 1;
+	coa->coa_dct_nr = 0;
+	coa->coa_max_dct_sz = 0;
+
+	octa.coa = coa;
+	octa.size = size;
+	rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false,
+			    obj_coll_tree_cb, &octa);
 	if (rc == 0)
 		D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap,
 			  "Something is wrong when prepare coll target array: %u vs %u\n",
@@ -287,36 +336,13 @@ static int
 obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size)
 {
 	struct daos_coll_target	*dct;
-	struct daos_coll_shard	*dcs;
-	uint32_t		 dct_size;
 	int			 i;
-	int			 j;
 
-	for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) {
+	for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0;
+	     i < coa->coa_dct_cap; i++) {
 		dct = &coa->coa_dcts[i];
-		if (dct->dct_bitmap != NULL) {
-			/* The size may be over estimated, no matter. */
-			dct_size = sizeof(*dct) + dct->dct_bitmap_sz +
-				   sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1);
-
-			for (j = 0; j <= dct->dct_max_shard; j++) {
-				dcs = &dct->dct_shards[j];
-				if (dcs->dcs_nr > 1)
-					dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr;
-			}
-
-			if (coa->coa_for_modify)
-				dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr;
-
-			if (coa->coa_max_dct_sz < dct_size)
-				coa->coa_max_dct_sz = dct_size;
-
-			if (coa->coa_dct_nr < i)
-				memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct));
-
-			coa->coa_dct_nr++;
-			*size += dct_size;
-		}
+		if (dct->dct_bitmap != NULL)
+			obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i);
 	}
 
 	/* Reset the other dct slots to avoid double free during cleanup. */
@@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj,
 
 	D_RWLOCK_RDLOCK(&obj->cob_lock);
 
-	D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank,
-		  "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank);
+	D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank,
+		  "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n",
+		  DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver);
 	D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank,
 		  "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank);
 
@@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	uint32_t			 tgt_size = 0;
 	uint32_t			 mbs_max_size;
 	uint32_t			 inline_size;
-	uint32_t			 flags = ORF_LEADER;
 	uint32_t			 leader = -1;
 	uint32_t			 len;
 	int				 rc;
@@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 		memcpy(dct, &tmp_tgt, sizeof(tmp_tgt));
 	}
 
+	/* 'shard' is on the leader target that is must be the coa->coa_dcts[0]. */
+	D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank,
+		  "Object "DF_OID" target array corrupted: rank %u vs %ur, nr %u\n",
+		  DP_OID(obj->cob_md.omd_id), shard->do_target_rank,
+		  coa->coa_dcts[0].dct_rank, coa->coa_dct_nr);
+
 	rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs);
 	if (rc < 0)
 		goto out;
@@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	if (rc != 0)
 		goto out;
 
+	auxi->flags = ORF_LEADER;
 	if (auxi->io_retry) {
-		flags |= ORF_RESEND;
+		auxi->flags |= ORF_RESEND;
 		/* Reset @enqueue_id if resend to new leader. */
 		if (spa->pa_auxi.target != shard->do_target_id)
 			spa->pa_auxi.enqueue_id = 0;
 	} else {
+		auxi->flags &= ~ORF_RESEND;
 		spa->pa_auxi.obj_auxi = auxi;
 		daos_dti_gen(&spa->pa_dti, false);
 	}
@@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	spa->pa_auxi.shard = shard->do_shard_idx;
 
 	if (obj_is_ec(obj))
-		flags |= ORF_EC;
+		auxi->flags |= ORF_EC;
 
 	mbs_max_size = sizeof(*mbs) + mbs->dm_data_size +
 		       sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz;
 
 	return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size,
 				       coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch,
-				       args->flags, flags, map_ver, &auxi->map_ver_reply, task);
+				       args->flags, auxi->flags, map_ver,
+				       &auxi->map_ver_reply, task);
 
 out:
 	if (rc > 0)
diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c
index 36c5c5f1e0c3..0c9dfc1418e5 100644
--- a/src/object/cli_shard.c
+++ b/src/object/cli_shard.c
@@ -1453,9 +1453,10 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data)
 
 	DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result,
 		  "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX "
-		  DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid),
-		  DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver,
-		  (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags);
+		  DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout",
+		  rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver,
+		  *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags,
+		  cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous");
 
 	crt_req_decref(rpc);
 
diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h
index ec6aa3b817e3..cfe3385e959c 100644
--- a/src/object/obj_internal.h
+++ b/src/object/obj_internal.h
@@ -292,10 +292,16 @@ struct coll_oper_args {
 	struct shard_auxi_args	 coa_auxi;
 	int			 coa_dct_nr;
 	uint32_t		 coa_dct_cap;
-	uint32_t		 coa_max_dct_sz;
+	union {
+		/* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */
+		uint32_t	 coa_min_rank;
+		/* Only can be used since obj_coll_oper_args_collapse() after object layout scan. */
+		uint32_t	 coa_max_dct_sz;
+	};
 	uint8_t			 coa_max_shard_nr;
 	uint8_t			 coa_max_bitmap_sz;
 	uint8_t			 coa_for_modify:1,
+				 coa_raw_sparse:1,
 				 coa_sparse:1;
 	uint8_t			 coa_target_nr;
 	/*
diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c
index 7bf0ef4aaf93..82d91c966acc 100644
--- a/src/object/obj_utils.c
+++ b/src/object/obj_utils.c
@@ -633,7 +633,7 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t
 		 * use the "cur_pos" as the relay engine.
 		 */
 		pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos;
-		if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) {
+		if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) {
 			memcpy(&tmp, &tgts[pos], sizeof(tmp));
 			memcpy(&tgts[pos], dct, sizeof(tmp));
 			memcpy(dct, &tmp, sizeof(tmp));
diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c
index 2a152b47bd64..5b59f954f865 100644
--- a/src/object/srv_coll.c
+++ b/src/object/srv_coll.c
@@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
 	int			 i;
 	int			 j;
 
-	/* dcts[0] is for current engine. */
-	if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 ||
-	    dcts[0].dct_shards == NULL) {
+	/* dcts[0] must be for current engine. */
+	if (unlikely(dcts[0].dct_rank != dss_self_rank())) {
+		D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n",
+			dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags);
+		D_GOTO(out, rc = -DER_INVAL);
+	}
+
+	if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 ||
+		     dcts[0].dct_shards == NULL)) {
 		D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n",
 			dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz,
 			dcts[0].dct_shards == NULL ? "empty" : "non-empty");
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index f220149a5326..041ea903c4fe 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -5615,12 +5615,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 
 	D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj "
 		DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti "
-		DF_DTI", forward width %u, forward depth %u\n",
+		DF_DTI", forward width %u, forward depth %u, flags %x\n",
 		(ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
 		(ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"),
 		rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
 		ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid),
-		ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth);
+		ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
 
 	D_ASSERT(dmi->dmi_xs_id != 0);
 
@@ -5757,13 +5757,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 	DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc,
 		  "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc "
 		  DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, "
-		  "tgt_nr %u, forward width %u, forward depth %u",
+		  "tgt_nr %u, forward width %u, forward depth %u, flags %x",
 		  (ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
 		  (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc,
 		  DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch,
 		  ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
 		  ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
-		  ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth);
+		  ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
 
 	obj_punch_complete(rpc, rc, max_ver);
 
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index f1d69b54f4e4..0f87bfd4d0e3 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -1973,9 +1973,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid,
 			rc = -DER_INPROGRESS;
 	}
 
-	if (rc < 0)
-		D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc));
-
 	return rc;
 }
 

From c4cf4f7b58f9315c2a294c899d807edb6b8917b0 Mon Sep 17 00:00:00 2001
From: Tom Nabarro <tom.nabarro@intel.com>
Date: Mon, 28 Oct 2024 13:49:15 +0000
Subject: [PATCH 24/26] DAOS-16687 control: Handle missing PCIe caps in storage
 query usage (#15296) (#15392)

Missing PCIe capabilities when querying a NVMe SSD's configuration
space is unusual but should be handled gracefully by the control-plane
and shouldn't cause a failure to return usage statistics when calling
dmg storage query usage.

Update so that pciutils lib is only called when attempting to display
health stats via dmg and not when fetching usage info. Improve clarity
of workflow to ease maintenance and add test coverage for updates.
Enable continued functionality when NVMe device doesn't return any
extended capabilities in PCIe configuration space data by adding
sentinel error to library for such a case.

Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
---
 .../common/proto/ctl/storage_nvme.pb.go       |  46 +++--
 src/control/lib/control/storage.go            |   4 +-
 src/control/lib/hardware/pciutils/bindings.go |   3 +-
 src/control/server/ctl_smd_rpc_test.go        |  49 +++++
 src/control/server/instance_storage_rpc.go    | 134 ++++++++++----
 .../server/instance_storage_rpc_test.go       | 172 ++++++++++++------
 src/proto/ctl/storage_nvme.proto              |   3 +-
 7 files changed, 308 insertions(+), 103 deletions(-)

diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go
index 62fede43ed4c..cb2dc5099d45 100644
--- a/src/control/common/proto/ctl/storage_nvme.pb.go
+++ b/src/control/common/proto/ctl/storage_nvme.pb.go
@@ -95,11 +95,12 @@ type ScanNvmeReq struct {
 	sizeCache     protoimpl.SizeCache
 	unknownFields protoimpl.UnknownFields
 
-	Health   bool   `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"`     // Retrieve NVMe device health statistics
-	Meta     bool   `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"`         // Retrieve metadata relating to NVMe device
-	Basic    bool   `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"`       // Strip NVMe device details to only basic
-	MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob
-	RdbSize  uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"`   // Size of the RDB blob
+	Health    bool   `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"`       // Retrieve NVMe device health statistics
+	Meta      bool   `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"`           // Retrieve metadata relating to NVMe device
+	Basic     bool   `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"`         // Strip NVMe device details to only basic
+	MetaSize  uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"`   // Size of the metadata blob
+	RdbSize   uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"`     // Size of the RDB blob
+	LinkStats bool   `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics
 }
 
 func (x *ScanNvmeReq) Reset() {
@@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 {
 	return 0
 }
 
+func (x *ScanNvmeReq) GetLinkStats() bool {
+	if x != nil {
+		return x.LinkStats
+	}
+	return false
+}
+
 type ScanNvmeResp struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{
 	0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53,
 	0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72,
 	0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08,
-	0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61,
+	0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61,
 	0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c,
 	0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68,
 	0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04,
@@ -285,18 +293,20 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{
 	0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65,
 	0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a,
 	0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65,
-	0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70,
-	0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b,
-	0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72,
-	0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a,
-	0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63,
-	0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65,
-	0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61,
-	0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68,
-	0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63,
-	0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-	0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f,
-	0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20,
+	0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65,
+	0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b,
+	0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13,
+	0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
+	0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73,
+	0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c,
+	0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05,
+	0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e,
+	0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
+	0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f,
+	0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
+	0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74,
+	0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
 }
 
 var (
diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go
index 65942a6c12d2..9d5fe470de6a 100644
--- a/src/control/lib/control/storage.go
+++ b/src/control/lib/control/storage.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2020-2023 Intel Corporation.
+// (C) Copyright 2020-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe
 				// Health and meta details required to populate usage statistics.
 				Health: req.NvmeHealth || req.Usage,
 				Meta:   req.Usage,
+				// Only request link stats if health explicitly requested.
+				LinkStats: req.NvmeHealth,
 			},
 		})
 	})
diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go
index 86686896c80d..f4d43c50fa31 100644
--- a/src/control/lib/hardware/pciutils/bindings.go
+++ b/src/control/lib/hardware/pciutils/bindings.go
@@ -40,6 +40,7 @@ var (
 	ErrMultiDevices     = errors.New("want single device config got multiple")
 	ErrCfgNotTerminated = errors.New("device config content not new-line terminated")
 	ErrCfgMissing       = errors.New("incomplete device config")
+	ErrNoPCIeCaps       = errors.New("no pci-express capabilities found")
 )
 
 // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of
@@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro
 	var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL)
 
 	if cp == nil {
-		return errors.New("no pci-express capabilities found")
+		return ErrNoPCIeCaps
 	}
 
 	cpAddr := uint32(cp.addr)
diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go
index 6378e81d23cd..06f1276fa252 100644
--- a/src/control/server/ctl_smd_rpc_test.go
+++ b/src/control/server/ctl_smd_rpc_test.go
@@ -17,6 +17,7 @@ import (
 	"github.com/daos-stack/daos/src/control/common/test"
 	"github.com/daos-stack/daos/src/control/drpc"
 	"github.com/daos-stack/daos/src/control/lib/daos"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/logging"
 	"github.com/daos-stack/daos/src/control/server/config"
@@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 		drpcResps      map[int][]*mockDrpcResponse
 		harnessStopped bool
 		ioStopped      bool
+		pciDevErr      error
 		expResp        *ctlpb.SmdQueryResp
 		expErr         error
 	}{
@@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 			},
 			expErr: daos.FreeMemError,
 		},
+		"list-devices; with health; update link stats": {
+			req: &ctlpb.SmdQueryReq{
+				OmitPools:        true,
+				Rank:             uint32(ranklist.NilRank),
+				Uuid:             test.MockUUID(1),
+				IncludeBioHealth: true,
+			},
+			drpcResps: map[int][]*mockDrpcResponse{
+				0: {
+					{
+						Message: &ctlpb.SmdDevResp{
+							Devices: []*ctlpb.SmdDevice{pbNormDev(0)},
+						},
+					},
+				},
+				1: {
+					{
+						Message: &ctlpb.SmdDevResp{
+							Devices: []*ctlpb.SmdDevice{
+								func() *ctlpb.SmdDevice {
+									sd := pbFaultDev(1)
+									sd.Ctrlr.PciCfg = "ABCD"
+									return sd
+								}(),
+							},
+						},
+					},
+					{
+						Message: &ctlpb.BioHealthResp{
+							Temperature: 1000000,
+							TempWarn:    true,
+						},
+					},
+				},
+			},
+			// Prove mock link stats provider gets called when IncludeBioHealth
+			// flag is set and Ctrlr.PciCfg string is not empty.
+			pciDevErr: errors.New("link stats provider fail"),
+			expErr:    errors.New("link stats provider fail"),
+		},
 		"ambiguous UUID": {
 			req: &ctlpb.SmdQueryReq{
 				Rank: uint32(ranklist.NilRank),
@@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
 			defer test.ShowBufferOnFailure(t, buf)
 
+			linkStatsProv = &mockPCIeLinkStatsProvider{
+				pciDevErr: tc.pciDevErr,
+			}
+			defer func() {
+				linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
+			}()
+
 			engineCount := len(tc.drpcResps)
 			if engineCount == 0 {
 				engineCount = 1
diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go
index fbca1e534a08..3097440f500e 100644
--- a/src/control/server/instance_storage_rpc.go
+++ b/src/control/server/instance_storage_rpc.go
@@ -23,14 +23,19 @@ import (
 	"github.com/daos-stack/daos/src/control/events"
 	"github.com/daos-stack/daos/src/control/fault"
 	"github.com/daos-stack/daos/src/control/lib/hardware"
-	"github.com/daos-stack/daos/src/control/lib/hardware/hwprov"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/server/storage"
 )
 
 var (
-	scanSmd                       = listSmdDevices
-	getCtrlrHealth                = getBioHealth
+	// Function pointers to enable mocking.
+	scanSmd       = listSmdDevices
+	scanHealth    = getBioHealth
+	linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
+
+	// Sentinel errors to enable comparison.
 	errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance")
+	errCtrlrHealthSkipped         = errors.New("controller health update was skipped")
 )
 
 // newMntRet creates and populates SCM mount result.
@@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe
 }
 
 func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error {
+	if health == nil {
+		return errors.New("nil BioHealthResp")
+	}
+
 	// Convert byte-string to lspci-format.
 	sb := new(strings.Builder)
 	formatBytestring(pciCfg, sb)
 
 	pciDev := &hardware.PCIDevice{}
 	if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil {
-		return err
+		return errors.Wrap(err, "pciutils lib")
 	}
 
 	// Copy link details from PCIDevice to health stats.
@@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth
 		lastMaxWidthStr, lastWidthStr, stats.LinkPortId)
 }
 
-func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) {
-	stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)]
-	if !ctrlr.CanSupplyHealthStats() {
-		engine.Debugf("skip fetching health stats on device %q in %q state",
-			ctrlr.PciAddr, stateName)
-		return false, nil
+type ctrlrHealthReq struct {
+	meta          bool
+	engine        Engine
+	bhReq         *ctlpb.BioHealthReq
+	ctrlr         *ctlpb.NvmeController
+	linkStatsProv hardware.PCIeLinkStatsProvider
+}
+
+// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health
+// resp if in a bad state.
+func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) {
+	stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)]
+	if !req.ctrlr.CanSupplyHealthStats() {
+		req.engine.Debugf("skip fetching health stats on device %q in %q state",
+			req.ctrlr.PciAddr, stateName)
+		return nil, errCtrlrHealthSkipped
 	}
 
-	health, err := getCtrlrHealth(ctx, engine, req)
+	health, err := scanHealth(ctx, req.engine, req.bhReq)
 	if err != nil {
-		return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr,
+		return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr,
 			stateName)
 	}
 
-	if ctrlr.PciCfg != "" {
-		if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil {
-			return false, errors.Wrapf(err, "add link stats for %q", ctrlr)
+	return health, nil
+}
+
+// Add link state and capability information to input health statistics for the given controller
+// then if successful publish events based on link statistic changes. Link updated health stats to
+// controller.
+func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error {
+	err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health)
+	if err == nil {
+		publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health)
+	} else {
+		if errors.Cause(err) != pciutils.ErrNoPCIeCaps {
+			return errors.Wrapf(err, "add link stats for %q", req.ctrlr)
+		}
+		req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr)
+	}
+
+	return nil
+}
+
+// Update controller health statistics and include link info if required and available.
+func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) {
+	health, err := getCtrlrHealth(ctx, req)
+	if err != nil {
+		if err == errCtrlrHealthSkipped {
+			// Nothing to do.
+			return false, nil
 		}
-		publishLinkStatEvents(engine, ctrlr.PciAddr, health)
+		return false, errors.Wrap(err, "get ctrlr health")
+	}
+
+	if req.linkStatsProv == nil {
+		req.engine.Debugf("device %q skip adding link stats; nil provider",
+			req.ctrlr.PciAddr)
+	} else if req.ctrlr.PciCfg == "" {
+		req.engine.Debugf("device %q skip adding link stats; empty pci cfg",
+			req.ctrlr.PciAddr)
 	} else {
-		engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr)
+		if err = setCtrlrHealthWithLinkInfo(req, health); err != nil {
+			return false, errors.Wrap(err, "set ctrlr health")
+		}
 	}
 
-	ctrlr.HealthStats = health
-	ctrlr.PciCfg = ""
+	req.ctrlr.HealthStats = health
 	return true, nil
 }
 
@@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
 
 		c := seenCtrlrs[addr]
 
-		// Only minimal info provided in standard scan to enable result aggregation across
-		// homogeneous hosts.
 		engineRank, err := engine.GetRank()
 		if err != nil {
 			return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index())
 		}
+
+		// Only provide minimal info in standard scan to enable result aggregation across
+		// homogeneous hosts.
 		nsd := &ctlpb.SmdDevice{
 			RoleBits:         sd.RoleBits,
 			CtrlrNamespaceId: sd.CtrlrNamespaceId,
@@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
 		// Populate health if requested.
 		healthUpdated := false
 		if pbReq.Health && c.HealthStats == nil {
-			bhReq := &ctlpb.BioHealthReq{
-				DevUuid:  sd.Uuid,
-				MetaSize: pbReq.MetaSize,
-				RdbSize:  pbReq.RdbSize,
+			bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid}
+			if pbReq.Meta {
+				bhReq.MetaSize = pbReq.MetaSize
+				bhReq.RdbSize = pbReq.RdbSize
 			}
-			upd, err := populateCtrlrHealth(ctx, engine, bhReq, c,
-				hwprov.DefaultPCIeLinkStatsProvider())
+
+			chReq := ctrlrHealthReq{
+				engine: engine,
+				bhReq:  bhReq,
+				ctrlr:  c,
+			}
+			if pbReq.LinkStats {
+				// Add link stats to health if flag set.
+				chReq.linkStatsProv = linkStatsProv
+			}
+
+			healthUpdated, err = populateCtrlrHealth(ctx, chReq)
 			if err != nil {
 				return nil, err
 			}
-			healthUpdated = upd
 		}
+		// Used to update health with link stats, now redundant.
+		c.PciCfg = ""
 
 		// Populate usage data if requested.
 		if pbReq.Meta {
@@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
 			continue // Skip health query if UUID doesn't match requested.
 		}
 		if pbReq.IncludeBioHealth {
-			bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid}
-			if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr,
-				hwprov.DefaultPCIeLinkStatsProvider()); err != nil {
+			chReq := ctrlrHealthReq{
+				engine:        engine,
+				bhReq:         &ctlpb.BioHealthReq{DevUuid: dev.Uuid},
+				ctrlr:         dev.Ctrlr,
+				linkStatsProv: linkStatsProv,
+			}
+
+			if _, err = populateCtrlrHealth(ctx, chReq); err != nil {
 				return nil, err
 			}
 		}
+		// Used to update health with link stats, now redundant.
+		dev.Ctrlr.PciCfg = ""
+
 		if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid {
 			rResp.Devices = []*ctlpb.SmdDevice{dev}
 			found = true
diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go
index b199adb6b8d3..0f72f739970b 100644
--- a/src/control/server/instance_storage_rpc_test.go
+++ b/src/control/server/instance_storage_rpc_test.go
@@ -21,6 +21,7 @@ import (
 	"github.com/daos-stack/daos/src/control/common/test"
 	"github.com/daos-stack/daos/src/control/events"
 	"github.com/daos-stack/daos/src/control/lib/hardware"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/logging"
 	"github.com/daos-stack/daos/src/control/server/config"
@@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 	}
 
 	for name, tc := range map[string]struct {
-		badDevState    bool
-		noPciCfgSpc    bool
-		pciDev         *hardware.PCIDevice
-		pciDevErr      error
-		emptyHealthRes bool
-		healthErr      error
-		lastStats      map[string]*ctlpb.BioHealthResp
-		expCtrlr       *ctlpb.NvmeController
-		expNotUpdated  bool
-		expErr         error
-		expDispatched  []*events.RASEvent
-		expLastStats   map[string]*ctlpb.BioHealthResp
+		badDevState      bool
+		nilLinkStatsProv bool
+		noPciCfgSpc      bool
+		pciDev           *hardware.PCIDevice
+		pciDevErr        error
+		healthReq        *ctlpb.BioHealthReq
+		healthRes        *ctlpb.BioHealthResp
+		nilHealthRes     bool
+		healthErr        error
+		lastStats        map[string]*ctlpb.BioHealthResp
+		expCtrlr         *ctlpb.NvmeController
+		expNotUpdated    bool
+		expErr           error
+		expDispatched    []*events.RASEvent
+		expLastStats     map[string]*ctlpb.BioHealthResp
 	}{
 		"bad state; skip health": {
 			badDevState: true,
-			noPciCfgSpc: true,
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr: pciAddr,
+				PciCfg:  "ABCD",
 			},
 			expNotUpdated: true,
 		},
@@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 				HealthStats: healthWithLinkStats(0, 0, 0, 0),
 			},
 		},
+		"nil bio health response": {
+			nilHealthRes: true,
+			expErr:       errors.New("nil BioHealthResp"),
+		},
 		"empty bio health response; empty link stats": {
-			emptyHealthRes: true,
-			pciDev:         new(hardware.PCIDevice),
+			healthRes: new(ctlpb.BioHealthResp),
+			pciDev:    new(hardware.PCIDevice),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: new(ctlpb.BioHealthResp),
 			},
@@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			pciDevErr: errors.New("fail"),
 			expErr:    errors.New("fail"),
 		},
+		"update health; add link stats; pciutils lib error; missing pcie caps": {
+			pciDevErr: pciutils.ErrNoPCIeCaps,
+			expCtrlr: &ctlpb.NvmeController{
+				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
+				DevState:    ctlpb.NvmeDevState_NORMAL,
+				HealthStats: healthWithLinkStats(0, 0, 0, 0),
+			},
+		},
 		"update health; add link stats; normal link state; no event published": {
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			},
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4),
 			},
@@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			},
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4),
 			},
@@ -167,6 +188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(proto.MockNvmeHealth()),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4),
 			},
@@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4),
 			},
@@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8),
 			},
@@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4),
 			},
@@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4),
 			},
@@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4),
 			},
@@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
 			defer test.ShowBufferOnFailure(t, buf)
 
-			healthRes := healthWithLinkStats(0, 0, 0, 0)
-			if tc.emptyHealthRes {
-				healthRes = new(ctlpb.BioHealthResp)
-			}
-			getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
-				return healthRes, tc.healthErr
+			scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
+				return tc.healthRes, tc.healthErr
 			}
 			defer func() {
-				getCtrlrHealth = getBioHealth
+				scanHealth = getBioHealth
 			}()
 
 			var devState ctlpb.NvmeDevState
@@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 					LinkMaxWidth: 4,
 				}
 			}
+			if tc.healthRes == nil && !tc.nilHealthRes {
+				tc.healthRes = healthWithLinkStats(0, 0, 0, 0)
+			}
 
-			mockProv := &mockPCIeLinkStatsProvider{
-				pciDev:    tc.pciDev,
-				pciDevErr: tc.pciDevErr,
+			var mockProv *mockPCIeLinkStatsProvider
+			if !tc.nilLinkStatsProv {
+				mockProv = &mockPCIeLinkStatsProvider{
+					pciDev:    tc.pciDev,
+					pciDevErr: tc.pciDevErr,
+				}
 			}
 
 			ctrlr := &ctlpb.NvmeController{
@@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			subscriber := newMockSubscriber(2)
 			ps.Subscribe(events.RASTypeInfoOnly, subscriber)
 
-			upd, err := populateCtrlrHealth(test.Context(t), ei,
-				&ctlpb.BioHealthReq{}, ctrlr, mockProv)
+			chReq := ctrlrHealthReq{
+				engine:        ei,
+				bhReq:         tc.healthReq,
+				ctrlr:         ctrlr,
+				linkStatsProv: mockProv,
+			}
+
+			upd, err := populateCtrlrHealth(test.Context(t), chReq)
 			test.CmpErr(t, tc.expErr, err)
 			if err != nil {
 				return
@@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 	}
 	defSmdScanRes := func() *ctlpb.SmdDevResp {
 		sd := proto.MockSmdDevice(c, 2)
+		sd.Rank = 2
 		return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}}
 	}
 	healthRespWithUsage := func() *ctlpb.BioHealthResp {
@@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 		mh.MetaWalSize, mh.RdbWalSize = 4, 5
 		return mh
 	}
+	ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController {
+		c := proto.MockNvmeController(2)
+		c.HealthStats = healthRespWithUsage()
+		sd := proto.MockSmdDevice(nil, 2)
+		sd.Rank = 1
+		sd.TotalBytes = c.HealthStats.TotalBytes
+		sd.AvailBytes = c.HealthStats.AvailBytes
+		sd.ClusterSize = c.HealthStats.ClusterSize
+		sd.MetaWalSize = c.HealthStats.MetaWalSize
+		sd.RdbWalSize = c.HealthStats.RdbWalSize
+		c.SmdDevices = []*ctlpb.SmdDevice{sd}
+		return c
+	}
 
 	for name, tc := range map[string]struct {
 		req                 ctlpb.ScanNvmeReq
@@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				State: new(ctlpb.ResponseState),
 			},
 		},
-		"scan over drpc; with smd and health; usage and wal size reported": {
+		"scan over drpc; with meta and health; usage and wal size reported": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			rank:      1,
 			smdRes:    defSmdScanRes(),
 			healthRes: healthRespWithUsage(),
 			expResp: &ctlpb.ScanNvmeResp{
-				Ctrlrs: proto.NvmeControllers{
-					func() *ctlpb.NvmeController {
-						c := proto.MockNvmeController(2)
-						c.HealthStats = healthRespWithUsage()
-						sd := proto.MockSmdDevice(nil, 2)
-						sd.Rank = 1
-						sd.TotalBytes = c.HealthStats.TotalBytes
-						sd.AvailBytes = c.HealthStats.AvailBytes
-						sd.ClusterSize = c.HealthStats.ClusterSize
-						sd.MetaWalSize = c.HealthStats.MetaWalSize
-						sd.RdbWalSize = c.HealthStats.RdbWalSize
-						c.SmdDevices = []*ctlpb.SmdDevice{sd}
-						return c
-					}(),
-				},
-				State: new(ctlpb.ResponseState),
+				Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()},
+				State:  new(ctlpb.ResponseState),
 			},
 		},
 		"scan over drpc; only ctrlrs with valid states shown": {
@@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				State: new(ctlpb.ResponseState),
 			},
 		},
-		"scan over drpc; with smd and health; missing ctrlr in smd": {
+		"scan over drpc; with meta and health; missing ctrlr in smd": {
 			req: ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdRes: func() *ctlpb.SmdDevResp {
 				ssr := defSmdScanRes()
@@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("no ctrlr ref"),
 		},
-		"scan over drpc; with smd and health; health scan fails": {
+		"scan over drpc; with meta and health; health scan fails": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdRes:    defSmdScanRes(),
 			healthErr: errors.New("health scan failed"),
 			expErr:    errors.New("health scan failed"),
 		},
-		"scan over drpc; with smd and health; smd list fails": {
+		"scan over drpc; with meta and health; smd list fails": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdErr:    errors.New("smd scan failed"),
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("smd scan failed"),
 		},
-		"scan over drpc; with smd and health; nil smd list returned": {
+		"scan over drpc; with meta and health; nil smd list returned": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("nil smd scan resp"),
 		},
+		"scan over drpc; with meta and health; link info update skipped": {
+			req:  ctlpb.ScanNvmeReq{Meta: true, Health: true},
+			rank: 1,
+			smdRes: func() *ctlpb.SmdDevResp {
+				ssr := defSmdScanRes()
+				ssr.Devices[0].Ctrlr.PciCfg = "ABCD"
+				return ssr
+			}(),
+			healthRes: healthRespWithUsage(),
+			expResp: &ctlpb.ScanNvmeResp{
+				Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()},
+				State:  new(ctlpb.ResponseState),
+			},
+		},
+		"scan over drpc; with health; link info update run but failed": {
+			req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true},
+			smdRes: func() *ctlpb.SmdDevResp {
+				ssr := defSmdScanRes()
+				ssr.Devices[0].Ctrlr.PciCfg = "ABCD"
+				return ssr
+			}(),
+			healthRes: healthRespWithUsage(),
+			// Prove link stat provider gets called when LinkStats flag set.
+			expErr: errors.New("link stats provider fail"),
+		},
 	} {
 		t.Run(name, func(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
@@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 			defer func() {
 				scanSmd = listSmdDevices
 			}()
-			getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
+			scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
 				return tc.healthRes, tc.healthErr
 			}
 			defer func() {
-				getCtrlrHealth = getBioHealth
+				scanHealth = getBioHealth
+			}()
+			linkStatsProv = &mockPCIeLinkStatsProvider{
+				pciDevErr: errors.New("link stats provider fail"),
+			}
+			defer func() {
+				linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
 			}()
 
 			if tc.provRes == nil {
@@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				ei.setSuperblock(nil)
 			} else {
 				ei.setSuperblock(&Superblock{
-					Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true,
+					Rank:      ranklist.NewRankPtr(uint32(tc.rank)),
+					ValidRank: true,
 				})
 			}
 
diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto
index 944d8e943bae..be068e5274d0 100644
--- a/src/proto/ctl/storage_nvme.proto
+++ b/src/proto/ctl/storage_nvme.proto
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2019-2023 Intel Corporation.
+// (C) Copyright 2019-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -28,6 +28,7 @@ message ScanNvmeReq {
 	bool Basic = 3;		// Strip NVMe device details to only basic
 	uint64 MetaSize = 4;	// Size of the metadata blob
 	uint64 RdbSize = 5;	// Size of the RDB blob
+	bool   LinkStats = 6;     // Populate PCIe link info in health statistics
 }
 
 message ScanNvmeResp {

From eb95b5555016ccb40c6915ededd17943f652f1e0 Mon Sep 17 00:00:00 2001
From: wiliamhuang <lei.huang@intel.com>
Date: Mon, 28 Oct 2024 09:00:18 -0500
Subject: [PATCH 25/26] DAOS-16722 client: to intercept PMPI_Init() in
 libpil4dfs (#15387)

Intercept PMPI_Init() to avoid calling daos_init() if MPI_Init() is intercepted by other library (like darshan and mpip).

Signed-off-by: Lei Huang <lei.huang@intel.com>
---
 src/client/dfuse/pil4dfs/int_dfs.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index 3087cb09c75f..b69449f1e3d1 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -470,6 +470,7 @@ static int (*next_tcgetattr)(int fd, void *termios_p);
 /* end NOT supported by DAOS */
 
 static int (*next_mpi_init)(int *argc, char ***argv);
+static int (*next_pmpi_init)(int *argc, char ***argv);
 
 /* to do!! */
 /**
@@ -1041,6 +1042,22 @@ MPI_Init(int *argc, char ***argv)
 	return rc;
 }
 
+int
+PMPI_Init(int *argc, char ***argv)
+{
+	int rc;
+
+	if (next_pmpi_init == NULL) {
+		next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init");
+		D_ASSERT(next_pmpi_init != NULL);
+	}
+
+	atomic_fetch_add_relaxed(&mpi_init_count, 1);
+	rc = next_pmpi_init(argc, argv);
+	atomic_fetch_add_relaxed(&mpi_init_count, -1);
+	return rc;
+}
+
 /** determine whether a path (both relative and absolute) is on DAOS or not. If yes,
  *  returns parent object, item name, full path of parent dir, full absolute path, and
  *  the pointer to struct dfs_mt.

From bde13c310ef66e302d3bfb55af2b252a67386cb7 Mon Sep 17 00:00:00 2001
From: mjean308 <48688872+mjean308@users.noreply.github.com>
Date: Mon, 28 Oct 2024 11:25:47 -0400
Subject: [PATCH 26/26] DAOS-15943 test: Remove server logging from
 pre-teardown (#15282) (#15386)

Signed-off-by: Maureen Jean <maureen.jean@intel.com>
---
 src/tests/ftest/util/soak_test_base.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py
index f32e068cb167..2d1c11e722e9 100644
--- a/src/tests/ftest/util/soak_test_base.py
+++ b/src/tests/ftest/util/soak_test_base.py
@@ -25,8 +25,8 @@
 from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse,
                         create_app_cmdline, create_dm_cmdline, create_fio_cmdline,
                         create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline,
-                        create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers,
-                        get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot,
+                        create_racer_cmdline, ddhhmmss_format, get_harassers,
+                        launch_exclude_reintegrate, launch_extend, launch_reboot,
                         launch_server_stop_start, launch_snapshot, launch_vmd_identify_check,
                         reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check)
 
@@ -164,17 +164,6 @@ def pre_tear_down(self):
 
         # display final metrics
         run_metrics_check(self, prefix="final")
-        # Gather server logs
-        try:
-            get_daos_server_logs(self)
-        except SoakTestError as error:
-            errors.append(f"<<FAILED: Failed to gather server logs {error}>>")
-        # Gather journalctl logs
-        hosts = list(set(self.hostlist_servers))
-        since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time))
-        until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time))
-        for journalctl_type in ["kernel", "daos_server"]:
-            get_journalctl(self, hosts, since, until, journalctl_type, logging=True)
 
         if self.all_failed_harassers:
             errors.extend(self.all_failed_harassers)