diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 75ce018f41f..26ec596c860 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -59,3 +59,7 @@ src/pool/ @daos-stack/metadata-owners @daos-stack/metadata-watchers src/container/ @daos-stack/metadata-owners @daos-stack/metadata-watchers src/rdb/ @daos-stack/metadata-owners @daos-stack/metadata-watchers src/rsvc/ @daos-stack/metadata-owners @daos-stack/metadata-watchers + +# PRs that touch GitHub actions +.github/workflows/ @daos-stack/actions-watchers +.github/actions/ @daos-stack/actions-watchers diff --git a/.github/dependabot.yml b/.github/dependabot.yml index b9a7d8a8b6a..3c1996d1c04 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -11,3 +11,9 @@ updates: python-packages: patterns: - "*" + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + assignees: + - daos-stack/actions-watchers diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 9ffccd562aa..af3059c22fe 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -155,7 +155,7 @@ jobs: - name: Install extra python packages run: python3 -m pip install --requirement utils/cq/requirements.txt - name: Run check - uses: codespell-project/actions-codespell@3174815d6231f5bdc24dbfb6fc3b8caec73d521c # master + uses: codespell-project/actions-codespell@406322ec52dd7b488e48c1c4b82e2a8b3a1bf630 # master with: skip: ./src/control/vendor,./src/control/go.sum,./.git ignore_words_file: ci/codespell.ignores @@ -197,6 +197,20 @@ jobs: - name: Run check run: yamllint --format github . + copyright: + name: Copyright check + runs-on: ubuntu-24.04 + steps: + - name: Check out source repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + - name: Run check + run: | + \[ ! -x ./utils/cq/check_update_copyright.sh \] || ./utils/cq/check_update_copyright.sh \ + $(git merge-base HEAD ${{ github.event.pull_request.base.sha || github.ref }}) gha + linting-summary: name: Linting Summary runs-on: ubuntu-24.04 diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml index ead66309fba..5117f992ebd 100644 --- a/.github/workflows/ossf-scorecard.yml +++ b/.github/workflows/ossf-scorecard.yml @@ -71,6 +71,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). 
# Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9 + uses: github/codeql-action/upload-sarif@babb554ede22fd5605947329c4d04d8e7a0b8155 # v3.27.7 with: sarif_file: results.sarif diff --git a/.github/workflows/rpm-build-and-test.yml b/.github/workflows/rpm-build-and-test.yml index e0d0142b6e2..6f33efc1ad0 100644 --- a/.github/workflows/rpm-build-and-test.yml +++ b/.github/workflows/rpm-build-and-test.yml @@ -103,7 +103,7 @@ jobs: LEAP15_VERSION: ${{ needs.Variables.outputs.LEAP15_VERSION }} PACKAGING_DIR: ${{ needs.Variables.outputs.PACKAGING_DIR }} COVFN_DISABLED: ${{ fromJSON(needs.Variables.outputs.COVFN_DISABLED) }} - RUN_GHA: ${{ fromJSON(inputs.run-gha) }} + RUN_GHA: ${{ inputs.run-gha && fromJSON(inputs.run-gha) || false }} Calc-functional-matrix: name: Calculate Functional Testing Matrix diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 89152cb1afa..45a2a0edd1e 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Run Trivy vulnerability scanner in filesystem mode (table format) - uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 with: scan-type: 'fs' scan-ref: '.' @@ -51,15 +51,14 @@ jobs: sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml - name: Run Trivy vulnerability scanner in filesystem mode (sarif format) - uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 with: scan-type: 'fs' scan-ref: '.' trivy-config: 'utils/trivy/trivy.yaml' - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a - # 3.25.15 (v3) + uses: github/codeql-action/upload-sarif@babb554ede22fd5605947329c4d04d8e7a0b8155 # v3.27.7 with: sarif_file: 'trivy-results.sarif' @@ -70,7 +69,7 @@ jobs: sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml - name: Run Trivy vulnerability scanner in filesystem mode (human readable format) - uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 with: scan-type: 'fs' scan-ref: '.' diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index f5b9c8b03e9..2fa2dacb6c9 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -41,5 +41,5 @@ pip install /opt/daos/lib/daos/python/ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n -./utils/node_local_test.py --max-log-size 1700MiB --dfuse-dir /localhome/jenkins/ \ +./utils/node_local_test.py --max-log-size 1900MiB --dfuse-dir /localhome/jenkins/ \ --log-usage-save nltir.xml --log-usage-export nltr.json all diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index bd6c2f0f3b0..40a3a9c513e 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -698,25 +698,25 @@ The example below shows a rebuild in progress and NVMe space allocated. 
Rebuild busy, 75 objs, 9722 recs ``` -After experiencing significant failures, the pool may retain some suspect +After experiencing significant failures, the pool may retain some "dead" engines that have been marked as DEAD by the SWIM protocol but were not excluded from the pool to prevent potential data inconsistency. An administrator can bring these engines back online by restarting them. The example below illustrates the -system’s status with suspect and disabled engines. +system’s status with dead and disabled engines. ```bash $ dmg pool query tank -t ``` NB: The --health-only/-t option is necessary to conduct pool health-related queries only. -This is important because suspect ranks may cause commands to hang and timeout so identifying +This is important because dead ranks may cause commands to hang and time out, so identifying and restarting them is a useful procedure. ```bash Pool 6f450a68-8c7d-4da9-8900-02691650f6a2, ntarget=8, disabled=2, leader=3, version=4, state=Degraded Pool health info: - Disabled ranks: 1 - - Suspect ranks: 2 + - Dead ranks: 2 - Rebuild busy, 0 objs, 0 recs ``` diff --git a/src/client/dfs/dfs_sys.c b/src/client/dfs/dfs_sys.c index 679d9411480..cbd1a4406fe 100644 --- a/src/client/dfs/dfs_sys.c +++ b/src/client/dfs/dfs_sys.c @@ -1318,6 +1318,12 @@ dfs_sys_remove_type(dfs_sys_t *dfs_sys, const char *path, bool force, return EINVAL; if (path == NULL) return EINVAL; + /* + * since we are not evicting child entries from the dfs sys cache in case of force removal + * of a dir, just disallow force removal if cache is enabled on the dfs sys mount. + */ + if (dfs_sys->hash && force) + return ENOTSUP; rc = sys_path_parse(dfs_sys, &sys_path, path); if (rc != 0) diff --git a/src/client/dfs/duns.c b/src/client/dfs/duns.c index 71ce57bf097..8b8ae332da2 100644 --- a/src/client/dfs/duns.c +++ b/src/client/dfs/duns.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation.
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1131,18 +1131,20 @@ duns_create_path(daos_handle_t poh, const char *path, struct duns_attr_t *attrp) int duns_link_cont(daos_handle_t poh, const char *cont, const char *path) { - daos_handle_t coh; - daos_prop_t *prop; - struct daos_prop_entry *entry; - daos_pool_info_t pinfo = {0}; - daos_cont_info_t cinfo = {0}; - daos_cont_layout_t type; - char pool_str[DAOS_UUID_STR_SIZE]; - char cont_str[DAOS_UUID_STR_SIZE]; - int len; - char str[DUNS_MAX_XATTR_LEN]; - char type_str[10]; - int rc, rc2; + daos_handle_t coh; + daos_prop_t *prop; + struct daos_prop_entry *entry; + daos_pool_info_t pinfo = {0}; + daos_cont_info_t cinfo = {0}; + daos_cont_layout_t type; + char pool_str[DAOS_UUID_STR_SIZE]; + char cont_str[DAOS_UUID_STR_SIZE]; + int len; + char str[DUNS_MAX_XATTR_LEN]; + char type_str[10]; + bool backend_dfuse = false; + int rc2; + int rc; if (path == NULL) { D_ERROR("Invalid path\n"); @@ -1155,7 +1157,7 @@ duns_link_cont(daos_handle_t poh, const char *cont, const char *path) return daos_der2errno(rc); } - rc = daos_cont_open(poh, cont, DAOS_COO_RO, &coh, &cinfo, NULL); + rc = daos_cont_open(poh, cont, DAOS_COO_RW, &coh, &cinfo, NULL); if (rc) { D_ERROR("daos_cont_open() failed "DF_RC"\n", DP_RC(rc)); return daos_der2errno(rc); @@ -1203,7 +1205,6 @@ duns_link_cont(daos_handle_t poh, const char *cont, const char *path) D_FREE(dir); D_GOTO(out_cont, rc = err); } - D_FREE(dir); #ifdef LUSTRE_INCLUDE if (fs.f_type == LL_SUPER_MAGIC) { rc = duns_link_lustre_path(pool_str, cont_str, type, path, mode); @@ -1218,6 +1219,7 @@ duns_link_cont(daos_handle_t poh, const char *cont, const char *path) if (rc == -1) { rc = errno; D_ERROR("Failed to create dir %s: %d (%s)\n", path, rc, strerror(rc)); + D_FREE(dir); D_GOTO(out_cont, rc); } @@ -1225,21 +1227,38 @@ duns_link_cont(daos_handle_t poh, const char *cont, const char *path) * to discover the user running dfuse. */ if (fs.f_type == FUSE_SUPER_MAGIC) { - struct stat finfo; - /* - * This next stat will cause dfuse to lookup the entry point and perform a - * container connect, therefore this data will be read from root of the new - * container, not the directory. - * - * TODO: This could call getxattr to verify success. - */ - rc = stat(path, &finfo); - if (rc) { - rc = errno; - D_ERROR("Failed to access container: %d (%s)\n", rc, strerror(rc)); + int fd; + struct dfuse_user_reply dur = {}; + + fd = open(dirp, O_RDONLY | O_DIRECTORY | O_NOFOLLOW); + if (fd == -1) { + int err = errno; + + DS_ERROR(err, "Dfuse open failed '%s", dirp); + D_FREE(dir); + D_GOTO(err_link, err); + } + + rc = ioctl(fd, DFUSE_IOCTL_DFUSE_USER, &dur); + close(fd); + if (rc == -1) { + int err = errno; + + DS_ERROR(err, "Dfuse ioctl failed %s", dirp); + D_FREE(dir); + D_GOTO(err_link, err); + } + + rc = duns_set_fuse_acl(dur.uid, coh); + if (rc != -DER_SUCCESS) { + DS_ERROR(rc, "Dfuse set acl failed %s", dirp); + D_FREE(dir); D_GOTO(err_link, rc); } + + backend_dfuse = true; } + D_FREE(dir); } else if (type != DAOS_PROP_CO_LAYOUT_UNKNOWN) { /** create a new file for other container types */ int fd; @@ -1304,11 +1323,33 @@ duns_link_cont(daos_handle_t poh, const char *cont, const char *path) } D_GOTO(err_link, rc); } + if (backend_dfuse) { + struct stat finfo; + /* + * This next stat will cause dfuse to lookup the entry point and perform a + * container connect, therefore this data will be read from root of the new + * container, not the directory. + * + * TODO: This could call getxattr to verify success. 
+ */ + rc = stat(path, &finfo); + if (rc) { + rc = errno; + DS_ERROR(rc, "Failed to access container bind at '%s'", path); + goto err_link; + } + } out_cont: rc2 = daos_cont_close(coh, NULL); - if (rc == 0) - rc = rc2; + if (rc2 != -DER_SUCCESS) { + DL_ERROR(rc2, "failed to close container"); + if (rc2 == -DER_NOMEM) + /* Second close to properly handle fault injection */ + daos_cont_close(coh, NULL); + else if (rc == -DER_SUCCESS) + rc = daos_der2errno(rc2); + } return rc; err_link: if (type == DAOS_PROP_CO_LAYOUT_POSIX) diff --git a/src/control/cmd/daos/container.go b/src/control/cmd/daos/container.go index 11a35e9e319..e69c78a370e 100644 --- a/src/control/cmd/daos/container.go +++ b/src/control/cmd/daos/container.go @@ -251,6 +251,7 @@ type containerCreateCmd struct { Mode ConsModeFlag `long:"mode" short:"M" description:"DFS consistency mode"` ACLFile string `long:"acl-file" short:"A" description:"input file containing ACL"` Group ui.ACLPrincipalFlag `long:"group" short:"g" description:"group who will own the container (group@[domain])"` + Attrs ui.SetPropertiesFlag `long:"attrs" short:"a" description:"user-defined attributes (key:val[,key:val...])"` Args struct { Label string `positional-arg-name:"label"` } `positional-args:"yes"` @@ -305,11 +306,7 @@ func (cmd *containerCreateCmd) Execute(_ []string) (err error) { defer disconnectPool() var contID string - if cmd.Path != "" { - contID, err = cmd.contCreateUNS() - } else { - contID, err = cmd.contCreate() - } + contID, err = cmd.contCreate() if err != nil { return err } @@ -397,76 +394,50 @@ func (cmd *containerCreateCmd) contCreate() (string, error) { if err != nil { return "", err } + contID := cmd.contUUID.String() + cContID := C.CString(contID) + defer freeString(cContID) - var contID string - if cmd.contUUID == uuid.Nil { - contID = cmd.contLabel - } else { - contID = cmd.contUUID.String() - } - - cmd.Infof("Successfully created container %s", contID) - return contID, nil -} - -func (cmd *containerCreateCmd) contCreateUNS() (string, error) { - var dattr C.struct_duns_attr_t - - props, cleanupProps, err := cmd.getCreateProps() - if err != nil { - return "", err + cleanupContainer := func() { + rc := C.daos_cont_destroy(cmd.cPoolHandle, cContID, goBool2int(true), nil) + if err := daosError(rc); err != nil { + cmd.Noticef("Failed to clean-up container %v", err) + } } - defer cleanupProps() - dattr.da_props = props - if !cmd.Type.Set { - return "", errors.New("container type is required for UNS") - } - dattr.da_type = cmd.Type.Type + if len(cmd.Attrs.ParsedProps) != 0 { + attrs := make(attrList, 0, len(cmd.Attrs.ParsedProps)) + for key, val := range cmd.Attrs.ParsedProps { + attrs = append(attrs, &attribute{ + Name: key, + Value: []byte(val), + }) + } - if cmd.poolUUID != uuid.Nil { - poolUUIDStr := C.CString(cmd.poolUUID.String()) - defer freeString(poolUUIDStr) - C.uuid_parse(poolUUIDStr, &dattr.da_puuid[0]) - } - if cmd.contUUID != uuid.Nil { - contUUIDStr := C.CString(cmd.contUUID.String()) - defer freeString(contUUIDStr) - C.uuid_parse(contUUIDStr, &dattr.da_cuuid[0]) - } + if err := cmd.openContainer(C.DAOS_COO_RW); err != nil { + cleanupContainer() + return "", errors.Wrapf(err, "failed to open new container %s", contID) + } + defer cmd.closeContainer() - if cmd.ChunkSize.Set { - dattr.da_chunk_size = cmd.ChunkSize.Size - } - if cmd.ObjectClass.Set { - dattr.da_oclass_id = cmd.ObjectClass.Class - } - if cmd.DirObjectClass.Set { - dattr.da_dir_oclass_id = cmd.DirObjectClass.Class - } - if cmd.FileObjectClass.Set { - 
dattr.da_file_oclass_id = cmd.FileObjectClass.Class - } - if cmd.CHints != "" { - hint := C.CString(cmd.CHints) - defer freeString(hint) - C.strncpy(&dattr.da_hints[0], hint, C.DAOS_CONT_HINT_MAX_LEN-1) + if err := setDaosAttributes(cmd.cContHandle, contAttr, attrs); err != nil { + cleanupContainer() + return "", errors.Wrapf(err, "failed to set user attributes on new container %s", contID) + } } - cPath := C.CString(cmd.Path) - defer freeString(cPath) + if cmd.Path != "" { + cPath := C.CString(cmd.Path) + defer freeString(cPath) - dunsErrno := C.duns_create_path(cmd.cPoolHandle, cPath, &dattr) - rc := C.daos_errno2der(dunsErrno) - if err := daosError(rc); err != nil { - return "", errors.Wrapf(err, "duns_create_path() failed") + dunsErrno := C.duns_link_cont(cmd.cPoolHandle, cContID, cPath) + rc := C.daos_errno2der(dunsErrno) + if err := daosError(rc); err != nil { + cleanupContainer() + return "", errors.Wrapf(err, "duns_link_cont() failed") + } } - contID := C.GoString(&dattr.da_cont[0]) - cmd.contUUID, err = uuid.Parse(contID) - if err != nil { - cmd.contLabel = contID - } cmd.Infof("Successfully created container %s type %s", contID, cmd.Type.String()) return contID, nil } diff --git a/src/control/cmd/daos/health.go b/src/control/cmd/daos/health.go index dbeaacde0a0..61f1d1df142 100644 --- a/src/control/cmd/daos/health.go +++ b/src/control/cmd/daos/health.go @@ -100,7 +100,7 @@ func (cmd *healthCheckCmd) Execute([]string) error { }() queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines, - daos.PoolQueryOptionSuspectEngines) + daos.PoolQueryOptionDeadEngines) if pool.DisabledTargets > 0 { queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines) } @@ -111,7 +111,7 @@ func (cmd *healthCheckCmd) Execute([]string) error { } pool.EnabledRanks = tpi.EnabledRanks pool.DisabledRanks = tpi.DisabledRanks - pool.SuspectRanks = tpi.SuspectRanks + pool.DeadRanks = tpi.DeadRanks poolConts, err := listContainers(poolHdl) if err != nil { diff --git a/src/control/cmd/daos/pool.go b/src/control/cmd/daos/pool.go index 085a78b3b73..0066f3959b7 100644 --- a/src/control/cmd/daos/pool.go +++ b/src/control/cmd/daos/pool.go @@ -300,7 +300,7 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) ( var rl *C.d_rank_list_t = nil if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) || - queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) { + queryMask.HasOption(daos.PoolQueryOptionDeadEngines) { rlPtr = &rl } @@ -330,8 +330,8 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) ( if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) { poolInfo.DisabledRanks = rs } - if queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) { - poolInfo.SuspectRanks = rs + if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) { + poolInfo.DeadRanks = rs } } @@ -357,8 +357,8 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo poolInfo.EnabledRanks = poolInfo1.EnabledRanks case daos.PoolQueryOptionDisabledEngines: poolInfo.DisabledRanks = poolInfo1.DisabledRanks - case daos.PoolQueryOptionSuspectEngines: - poolInfo.SuspectRanks = poolInfo1.SuspectRanks + case daos.PoolQueryOptionDeadEngines: + poolInfo.DeadRanks = poolInfo1.DeadRanks } return nil } @@ -369,8 +369,8 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo firstOption = daos.PoolQueryOptionEnabledEngines } else if 
originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) { firstOption = daos.PoolQueryOptionDisabledEngines - } else if originalMask.HasOption(daos.PoolQueryOptionSuspectEngines) { - firstOption = daos.PoolQueryOptionSuspectEngines + } else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) { + firstOption = daos.PoolQueryOptionDeadEngines } // Perform the first query to get basic information @@ -382,7 +382,7 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo queryOptions := []string{ daos.PoolQueryOptionEnabledEngines, daos.PoolQueryOptionDisabledEngines, - daos.PoolQueryOptionSuspectEngines, + daos.PoolQueryOptionDeadEngines, } // Process each option sequentially diff --git a/src/control/cmd/daos/pretty/health.go b/src/control/cmd/daos/pretty/health.go index ee77cd72371..3d3f8ca035d 100644 --- a/src/control/cmd/daos/pretty/health.go +++ b/src/control/cmd/daos/pretty/health.go @@ -61,12 +61,12 @@ func printPoolHealth(out io.Writer, pi *daos.PoolInfo, verbose bool) { } var healthStrings []string - if pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 { - degStr := "Suspect" + if pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 { + deadStr := "Dead" if verbose { - degStr += fmt.Sprintf(" %s", pi.SuspectRanks) + deadStr += fmt.Sprintf(" %s", pi.DeadRanks) } - healthStrings = append(healthStrings, degStr) + healthStrings = append(healthStrings, deadStr) } if pi.DisabledTargets > 0 { degStr := "Degraded" diff --git a/src/control/cmd/daos/pretty/pool.go b/src/control/cmd/daos/pretty/pool.go index 861f8fb17b3..75e7d9d13e9 100644 --- a/src/control/cmd/daos/pretty/pool.go +++ b/src/control/cmd/daos/pretty/pool.go @@ -76,9 +76,9 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error { if pi.DisabledRanks.Count() > 0 { fmt.Fprintf(w, "- Disabled ranks: %s\n", pi.DisabledRanks) } - if pi.QueryMask.HasOption(daos.PoolQueryOptionSuspectEngines) && - pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 { - fmt.Fprintf(w, "- Suspect ranks: %s\n", pi.SuspectRanks) + if pi.QueryMask.HasOption(daos.PoolQueryOptionDeadEngines) && + pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 { + fmt.Fprintf(w, "- Dead ranks: %s\n", pi.DeadRanks) } if pi.Rebuild != nil { if pi.Rebuild.Status == 0 { diff --git a/src/control/cmd/daos/pretty/pool_test.go b/src/control/cmd/daos/pretty/pool_test.go index c362b4fea44..3d2c4b3256c 100644 --- a/src/control/cmd/daos/pretty/pool_test.go +++ b/src/control/cmd/daos/pretty/pool_test.go @@ -128,7 +128,7 @@ Pool space info: Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), }, - "normal response; suspect ranks": { + "normal response; dead ranks": { pi: &daos.PoolInfo{ QueryMask: daos.HealthOnlyPoolQueryMask, State: daos.PoolServiceStateDegraded, @@ -141,7 +141,7 @@ Pool space info: PoolLayoutVer: 1, UpgradeLayoutVer: 2, DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"), - SuspectRanks: ranklist.MustCreateRankSet("[2]"), + DeadRanks: ranklist.MustCreateRankSet("[2]"), Rebuild: &daos.PoolRebuildStatus{ State: daos.PoolRebuildStateBusy, Objects: 42, @@ -163,7 +163,7 @@ Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Disabled ranks: 0-1,3 -- Suspect ranks: 2 +- Dead ranks: 2 - Rebuild busy, 42 objs, 21 recs `, poolUUID.String()), }, diff --git a/src/control/common/proto/mgmt/pool.pb.go b/src/control/common/proto/mgmt/pool.pb.go index df23fa87504..cf9acf3370c 100644 --- a/src/control/common/proto/mgmt/pool.pb.go +++ b/src/control/common/proto/mgmt/pool.pb.go @@ -6,7 +6,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.33.0 +// protoc-gen-go v1.31.0 // protoc v3.5.0 // source: mgmt/pool.proto @@ -1859,7 +1859,7 @@ type PoolQueryResp struct { SvcReps []uint32 `protobuf:"varint,19,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // service replica ranks QueryMask uint64 `protobuf:"varint,20,opt,name=query_mask,json=queryMask,proto3" json:"query_mask,omitempty"` // Bitmask of pool query options used MemFileBytes uint64 `protobuf:"varint,21,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-pool accumulated value of memory file sizes - SuspectRanks string `protobuf:"bytes,22,opt,name=suspect_ranks,json=suspectRanks,proto3" json:"suspect_ranks,omitempty"` // optional set of suspect ranks + DeadRanks string `protobuf:"bytes,22,opt,name=dead_ranks,json=deadRanks,proto3" json:"dead_ranks,omitempty"` // optional set of dead ranks } func (x *PoolQueryResp) Reset() { @@ -2034,9 +2034,9 @@ func (x *PoolQueryResp) GetMemFileBytes() uint64 { return 0 } -func (x *PoolQueryResp) GetSuspectRanks() string { +func (x *PoolQueryResp) GetDeadRanks() string { if x != nil { - return x.SuspectRanks + return x.DeadRanks } return "" } @@ -3076,7 +3076,7 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x04, 0x52, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, 0x53, 0x59, 0x10, - 0x02, 0x22, 0x8b, 0x06, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, + 0x02, 0x22, 0x85, 0x06, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, @@ -3121,108 +3121,108 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, - 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x73, 0x75, 0x73, 0x70, 0x65, - 0x63, 0x74, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x16, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, - 0x73, 0x75, 0x73, 0x70, 0x65, 0x63, 0x74, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x4a, 0x04, 0x08, 0x09, - 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, - 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, - 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, - 0x6c, 0x18, 0x02, 0x20, 0x01, 
0x28, 0x09, 0x48, 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, - 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, - 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, + 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x64, 0x65, 0x61, 0x64, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x16, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x64, 0x65, 0x61, + 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, + 0x74, 0x61, 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, + 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x12, 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x09, 0x48, 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, + 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, + 0x75, 0x6d, 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, + 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, + 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, + 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, + 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, + 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, + 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, + 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, + 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, + 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, + 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, + 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 
0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, + 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, + 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, + 0x74, 0x69, 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, + 0x61, 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, + 0x72, 0x61, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x22, 0x81, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, - 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, - 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, - 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, - 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, - 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, - 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, - 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, - 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, - 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, - 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, - 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, - 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, - 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, - 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, - 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, - 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 
0x79, 0x52, 0x0a, - 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, - 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, - 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, - 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, - 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, - 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, - 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, - 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x81, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, - 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, - 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, - 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, - 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, - 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, - 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, - 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, - 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, - 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, - 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, - 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, - 0x65, 0x22, 0x80, 0x03, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, - 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, - 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, - 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, - 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, - 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, - 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, - 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, - 0x12, 
0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, - 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, - 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, - 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x53, - 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x56, - 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, - 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, - 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, - 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, - 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, - 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, - 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, - 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, - 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, 0x05, 0x69, - 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, 0x0a, 0x10, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, - 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, - 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, 0x10, 0x50, - 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, - 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, - 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, - 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, 0x67, 0x72, - 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, - 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, - 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, - 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, - 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, - 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, + 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, + 0x07, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 
0x65, 0x54, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, + 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, + 0x12, 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, + 0x66, 0x72, 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, + 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, + 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, + 0x52, 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0x80, 0x03, 0x0a, 0x13, + 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, + 0x6e, 0x66, 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x0e, 0x32, 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, + 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, + 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, + 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, + 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, + 0x61, 0x63, 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, + 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, + 0x61, 0x67, 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, + 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, + 0x22, 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, + 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, + 0x44, 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x53, 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, + 0x02, 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x56, 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, + 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, + 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, + 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, + 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, + 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, + 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, + 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, + 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, + 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, + 0x67, 0x65, 
0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, + 0x0a, 0x10, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, + 0x70, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, + 0x56, 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, + 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, + 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, + 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, + 0x10, 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, + 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, + 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, + 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x33, } var ( diff --git a/src/control/lib/control/pool_test.go b/src/control/lib/control/pool_test.go index 1627b64e237..8e019bee0ba 100644 --- a/src/control/lib/control/pool_test.go +++ b/src/control/lib/control/pool_test.go @@ -882,12 +882,12 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { ServiceLeader: 6, ServiceReplicas: []ranklist.Rank{0, 1, 2}, DisabledRanks: &ranklist.RankSet{}, - SuspectRanks: ranklist.MustCreateRankSet("[7,8,9]"), + DeadRanks: ranklist.MustCreateRankSet("[7,8,9]"), PoolLayoutVer: 7, UpgradeLayoutVer: 8, }, }, - exp: `{"query_mask":"disabled_engines,rebuild,suspect_engines","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"disabled_ranks":[],"suspect_ranks":[7,8,9],"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":0,"status":42}`, + exp: `{"query_mask":"dead_engines,disabled_engines,rebuild","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"disabled_ranks":[],"dead_ranks":[7,8,9],"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":0,"status":42}`, }, } { t.Run(name, func(t *testing.T) { @@ -929,7 +929,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { }, }, "valid rankset": { - data: `{"enabled_ranks":"[0,1-3,5]","disabled_ranks":"[]","suspect_ranks":"[4]","status":0,"uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000}`, + data: `{"enabled_ranks":"[0,1-3,5]","dead_ranks":"[4]","disabled_ranks":"[]","status":0,"uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000}`, expResp: PoolQueryResp{ Status: 0, PoolInfo: daos.PoolInfo{ @@ -942,7 +942,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t 
*testing.T) { ServiceLeader: 6, EnabledRanks: ranklist.MustCreateRankSet("[0-3,5]"), DisabledRanks: &ranklist.RankSet{}, - SuspectRanks: ranklist.MustCreateRankSet("[4]"), + DeadRanks: ranklist.MustCreateRankSet("[4]"), PoolLayoutVer: 7, UpgradeLayoutVer: 8, MemFileBytes: 1000, @@ -1214,7 +1214,7 @@ func TestControl_PoolQuery(t *testing.T) { }, }, }, - "query succeeds suspect ranks": { + "query succeeds dead ranks": { mic: &MockInvokerConfig{ UnaryResponse: MockMSResponse("host1", nil, &mgmtpb.PoolQueryResp{ @@ -1248,7 +1248,7 @@ func TestControl_PoolQuery(t *testing.T) { MediaType: mgmtpb.StorageMediaType(daos.StorageMediaTypeNvme), }, }, - SuspectRanks: "[1,2,3,7]", + DeadRanks: "[1,2,3,7]", }, ), }, @@ -1284,7 +1284,7 @@ func TestControl_PoolQuery(t *testing.T) { MediaType: daos.StorageMediaTypeNvme, }, }, - SuspectRanks: ranklist.MustCreateRankSet("[1-3,7]"), + DeadRanks: ranklist.MustCreateRankSet("[1-3,7]"), }, }, }, diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index 240790a9d1f..3d26b7a2d4e 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -77,7 +77,7 @@ type ( TierStats []*StorageUsageStats `json:"tier_stats"` EnabledRanks *ranklist.RankSet `json:"enabled_ranks,omitempty"` DisabledRanks *ranklist.RankSet `json:"disabled_ranks,omitempty"` - SuspectRanks *ranklist.RankSet `json:"suspect_ranks,omitempty"` + DeadRanks *ranklist.RankSet `json:"dead_ranks,omitempty"` PoolLayoutVer uint32 `json:"pool_layout_ver"` UpgradeLayoutVer uint32 `json:"upgrade_layout_ver"` MemFileBytes uint64 `json:"mem_file_bytes"` @@ -107,7 +107,7 @@ type ( const ( // DefaultPoolQueryMask defines the default pool query mask. - DefaultPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_ENGINES_SUSPECT)) + DefaultPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_ENGINES_DEAD)) // HealthOnlyPoolQueryMask defines the mask for health-only queries. HealthOnlyPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_SPACE)) @@ -119,8 +119,8 @@ const ( PoolQueryOptionEnabledEngines = "enabled_engines" // PoolQueryOptionDisabledEngines retrieves disabled engines as part of the pool query. PoolQueryOptionDisabledEngines = "disabled_engines" - // PoolQueryOptionSuspectEngines retrieves suspect engines as part of the pool query. - PoolQueryOptionSuspectEngines = "suspect_engines" + // PoolQueryOptionDeadEngines retrieves dead engines as part of the pool query. + PoolQueryOptionDeadEngines = "dead_engines" // PoolConnectFlagReadOnly indicates that the connection is read-only. 
PoolConnectFlagReadOnly = C.DAOS_PC_RO @@ -135,7 +135,7 @@ var poolQueryOptMap = map[C.int]string{ C.DPI_REBUILD_STATUS: PoolQueryOptionRebuild, C.DPI_ENGINES_ENABLED: PoolQueryOptionEnabledEngines, C.DPI_ENGINES_DISABLED: PoolQueryOptionDisabledEngines, - C.DPI_ENGINES_SUSPECT: PoolQueryOptionSuspectEngines, + C.DPI_ENGINES_DEAD: PoolQueryOptionDeadEngines, } func resolvePoolQueryOpt(name string) (C.int, error) { diff --git a/src/control/lib/daos/pool_test.go b/src/control/lib/daos/pool_test.go index 1b91b9f5520..8cf5f71312f 100644 --- a/src/control/lib/daos/pool_test.go +++ b/src/control/lib/daos/pool_test.go @@ -136,14 +136,14 @@ func TestDaos_PoolQueryMask(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { *pqm = HealthOnlyPoolQueryMask }), - expString: genOptsStr(PoolQueryOptionDisabledEngines, PoolQueryOptionRebuild, PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionDisabledEngines, PoolQueryOptionRebuild), }, "set query all=true": { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetAll() }), - expString: genOptsStr(PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, - PoolQueryOptionRebuild, PoolQueryOptionSpace, PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, + PoolQueryOptionRebuild, PoolQueryOptionSpace), }, "set query all=false": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -163,8 +163,8 @@ func TestDaos_PoolQueryMask(t *testing.T) { pqm.SetAll() pqm.ClearOptions(PoolQueryOptionSpace) }), - expString: genOptsStr(PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, - PoolQueryOptionRebuild, PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, + PoolQueryOptionRebuild), }, "set query space=false (already false)": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -183,8 +183,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { pqm.SetAll() pqm.ClearOptions(PoolQueryOptionRebuild) }), - expString: genOptsStr(PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, PoolQueryOptionSpace, - PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionDisabledEngines, PoolQueryOptionEnabledEngines, PoolQueryOptionSpace), }, "set query enabled_engines=true": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -197,8 +196,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { pqm.SetAll() pqm.ClearOptions(PoolQueryOptionEnabledEngines) }), - expString: genOptsStr(PoolQueryOptionDisabledEngines, PoolQueryOptionRebuild, PoolQueryOptionSpace, - PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionDisabledEngines, PoolQueryOptionRebuild, PoolQueryOptionSpace), }, "set query disabled_engines=true": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -211,8 +209,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { pqm.SetAll() pqm.ClearOptions(PoolQueryOptionDisabledEngines) }), - expString: genOptsStr(PoolQueryOptionEnabledEngines, PoolQueryOptionRebuild, PoolQueryOptionSpace, - PoolQueryOptionSuspectEngines), + expString: genOptsStr(PoolQueryOptionDeadEngines, PoolQueryOptionEnabledEngines, PoolQueryOptionRebuild, PoolQueryOptionSpace), }, } { t.Run(name, func(t *testing.T) { @@ -237,7 +234,7 @@ func TestDaos_PoolQueryMaskMarshalJSON(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetAll() }), - 
expJSON: []byte(`"disabled_engines,enabled_engines,rebuild,space,suspect_engines"`), + expJSON: []byte(`"dead_engines,disabled_engines,enabled_engines,rebuild,space"`), }, } { t.Run(name, func(t *testing.T) { @@ -267,7 +264,7 @@ func TestDaos_PoolQueryMaskUnmarshalJSON(t *testing.T) { }, "uint64 value": { testData: []byte("18446744073709551603"), - expString: "rebuild,space,suspect_engines", + expString: "dead_engines,rebuild,space", }, "string values": { testData: []byte("rebuild,disabled_engines"), diff --git a/src/include/daos/pool.h b/src/include/daos/pool.h index e51758e16b1..07e861bb54a 100644 --- a/src/include/daos/pool.h +++ b/src/include/daos/pool.h @@ -82,8 +82,9 @@ * Version 1 corresponds to 2.2 (aggregation optimizations) * Version 2 corresponds to 2.4 (dynamic evtree, checksum scrubbing) * Version 3 corresponds to 2.6 (root embedded values, pool service operations tracking KVS) + * Version 4 corresponds to 2.8 (SV gang allocation) */ -#define DAOS_POOL_GLOBAL_VERSION 3 +#define DAOS_POOL_GLOBAL_VERSION 4 int dc_pool_init(void); void dc_pool_fini(void); diff --git a/src/include/daos_pool.h b/src/include/daos_pool.h index a8ab2e6c6a2..24746a70ff7 100644 --- a/src/include/daos_pool.h +++ b/src/include/daos_pool.h @@ -162,8 +162,8 @@ enum daos_pool_info_bit { DPI_ENGINES_ENABLED = 1ULL << 2, /** true to include (in \a ranks) engines with some or all targets disabled (down). */ DPI_ENGINES_DISABLED = 1ULL << 3, - /** true to include (in \a ranks) suspect engines. */ - DPI_ENGINES_SUSPECT = 1ULL << 4, + /** true to include (in \a ranks) engines marked DEAD by SWIM. */ + DPI_ENGINES_DEAD = 1ULL << 4, /** query all above optional info */ DPI_ALL = -1, }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 34e5d180e07..9e2ca08c271 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -289,6 +289,7 @@ int dsc_pool_svc_update_target_state(uuid_t pool_uuid, d_rank_list_t *ranks, uin struct pool_target_addr_list *target_list, pool_comp_state_t state); +uint32_t ds_pool_get_vos_df_version_default(void); int ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, d_rank_list_t *target_addrs, int ndomains, uint32_t *domains, daos_prop_t *prop, d_rank_list_t **svc_addrs); @@ -308,7 +309,7 @@ int dsc_pool_svc_delete_acl(uuid_t pool_uuid, d_rank_list_t *ranks, uint64_t dea int dsc_pool_svc_query(uuid_t pool_uuid, d_rank_list_t *ps_ranks, uint64_t deadline, d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ranks, - d_rank_list_t **suspect_ranks, daos_pool_info_t *pool_info, + d_rank_list_t **dead_ranks, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver); int dsc_pool_svc_query_target(uuid_t pool_uuid, d_rank_list_t *ps_ranks, uint64_t deadline, d_rank_t rank, uint32_t tgt_idx, daos_target_info_t *ti); diff --git a/src/include/daos_srv/rebuild.h b/src/include/daos_srv/rebuild.h index 90596e64d4f..18c452ebe2d 100644 --- a/src/include/daos_srv/rebuild.h +++ b/src/include/daos_srv/rebuild.h @@ -67,6 +67,21 @@ typedef enum { DP_UUID((mqa)->pool_uuid), (mqa)->version, (mqa)->generation, RB_OP_STR((mqa)->rebuild_op) #define DP_RBF_MQA(mqa) DP_RB_MQA(mqa), (mqa)->leader_rank, (mqa)->leader_term +/* arguments for log rebuild identifier given a struct obj_migrate_in *omi */ +#define DP_RB_OMI(omi) \ + DP_UUID((omi)->om_pool_uuid), (omi)->om_version, (omi)->om_generation, \ + RB_OP_STR((omi)->om_opc) + +/* arguments for log rebuild identifier given a struct migrate_pool_tls *mpt */ +#define 
DP_RB_MPT(mpt) \ + DP_UUID((mpt)->mpt_pool_uuid), (mpt)->mpt_version, (mpt)->mpt_generation, \ + RB_OP_STR((mpt)->mpt_opc) + +/* arguments for log rebuild identifier given a struct migrate_one *mro */ +#define DP_RB_MRO(mro) \ + DP_UUID((mro)->mo_pool_uuid), (mro)->mo_pool_tls_version, (mro)->mo_generation, \ + RB_OP_STR((mro)->mo_opc) + int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph, uint32_t layout_version, struct pool_target_id_list *tgts, diff --git a/src/mgmt/pool.pb-c.c b/src/mgmt/pool.pb-c.c index f733660ccf4..e8ccc8a1d0d 100644 --- a/src/mgmt/pool.pb-c.c +++ b/src/mgmt/pool.pb-c.c @@ -3318,283 +3318,149 @@ const ProtobufCMessageDescriptor mgmt__pool_rebuild_status__descriptor = (ProtobufCMessageInit) mgmt__pool_rebuild_status__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[21] = -{ - { - "status", - 1, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_INT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, status), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "uuid", - 2, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, uuid), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "label", - 3, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, label), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "total_targets", - 4, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, total_targets), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "active_targets", - 5, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, active_targets), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "disabled_targets", - 6, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, disabled_targets), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "rebuild", - 7, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, rebuild), - &mgmt__pool_rebuild_status__descriptor, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "tier_stats", - 8, - PROTOBUF_C_LABEL_REPEATED, - PROTOBUF_C_TYPE_MESSAGE, - offsetof(Mgmt__PoolQueryResp, n_tier_stats), - offsetof(Mgmt__PoolQueryResp, tier_stats), - &mgmt__storage_usage_stats__descriptor, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "version", - 10, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, version), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "leader", - 11, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, leader), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "enabled_ranks", - 12, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, enabled_ranks), - NULL, - &protobuf_c_empty_string, - 
0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "disabled_ranks", - 13, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, disabled_ranks), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "total_engines", - 14, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, total_engines), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "pool_layout_ver", - 15, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, pool_layout_ver), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "upgrade_layout_ver", - 16, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, upgrade_layout_ver), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "state", - 17, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, state), - &mgmt__pool_service_state__descriptor, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ldr", - 18, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT32, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, svc_ldr), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "svc_reps", - 19, - PROTOBUF_C_LABEL_REPEATED, - PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolQueryResp, n_svc_reps), - offsetof(Mgmt__PoolQueryResp, svc_reps), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "query_mask", - 20, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT64, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, query_mask), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "mem_file_bytes", - 21, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_UINT64, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, mem_file_bytes), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "suspect_ranks", - 22, - PROTOBUF_C_LABEL_NONE, - PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(Mgmt__PoolQueryResp, suspect_ranks), - NULL, - &protobuf_c_empty_string, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[21] = { + { + "status", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, status), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "uuid", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "label", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, label), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "total_targets", 4, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, total_targets), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "active_targets", 5, 
PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, active_targets), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "disabled_targets", 6, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, disabled_targets), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "rebuild", 7, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, rebuild), &mgmt__pool_rebuild_status__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tier_stats", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(Mgmt__PoolQueryResp, n_tier_stats), offsetof(Mgmt__PoolQueryResp, tier_stats), + &mgmt__storage_usage_stats__descriptor, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "version", 10, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, version), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "leader", 11, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, leader), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "enabled_ranks", 12, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, enabled_ranks), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "disabled_ranks", 13, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, disabled_ranks), NULL, &protobuf_c_empty_string, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "total_engines", 14, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, total_engines), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "pool_layout_ver", 15, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, pool_layout_ver), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "upgrade_layout_ver", 16, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, upgrade_layout_ver), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "state", 17, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_ENUM, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, state), &mgmt__pool_service_state__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ldr", 18, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, svc_ldr), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "svc_reps", 19, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolQueryResp, n_svc_reps), offsetof(Mgmt__PoolQueryResp, svc_reps), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "query_mask", 20, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT64, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, query_mask), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + 
"mem_file_bytes", 21, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, mem_file_bytes), NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dead_ranks", 22, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, dead_ranks), NULL, &protobuf_c_empty_string, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_query_resp__field_indices_by_name[] = { - 4, /* field[4] = active_targets */ - 11, /* field[11] = disabled_ranks */ - 5, /* field[5] = disabled_targets */ - 10, /* field[10] = enabled_ranks */ - 2, /* field[2] = label */ - 9, /* field[9] = leader */ - 19, /* field[19] = mem_file_bytes */ - 13, /* field[13] = pool_layout_ver */ - 18, /* field[18] = query_mask */ - 6, /* field[6] = rebuild */ - 15, /* field[15] = state */ - 0, /* field[0] = status */ - 20, /* field[20] = suspect_ranks */ - 16, /* field[16] = svc_ldr */ - 17, /* field[17] = svc_reps */ - 7, /* field[7] = tier_stats */ - 12, /* field[12] = total_engines */ - 3, /* field[3] = total_targets */ - 14, /* field[14] = upgrade_layout_ver */ - 1, /* field[1] = uuid */ - 8, /* field[8] = version */ + 4, /* field[4] = active_targets */ + 20, /* field[20] = dead_ranks */ + 11, /* field[11] = disabled_ranks */ + 5, /* field[5] = disabled_targets */ + 10, /* field[10] = enabled_ranks */ + 2, /* field[2] = label */ + 9, /* field[9] = leader */ + 19, /* field[19] = mem_file_bytes */ + 13, /* field[13] = pool_layout_ver */ + 18, /* field[18] = query_mask */ + 6, /* field[6] = rebuild */ + 15, /* field[15] = state */ + 0, /* field[0] = status */ + 16, /* field[16] = svc_ldr */ + 17, /* field[17] = svc_reps */ + 7, /* field[7] = tier_stats */ + 12, /* field[12] = total_engines */ + 3, /* field[3] = total_targets */ + 14, /* field[14] = upgrade_layout_ver */ + 1, /* field[1] = uuid */ + 8, /* field[8] = version */ }; static const ProtobufCIntRange mgmt__pool_query_resp__number_ranges[2 + 1] = { diff --git a/src/mgmt/pool.pb-c.h b/src/mgmt/pool.pb-c.h index d7468ff90d0..a6409601cad 100644 --- a/src/mgmt/pool.pb-c.h +++ b/src/mgmt/pool.pb-c.h @@ -874,9 +874,9 @@ struct _Mgmt__PoolQueryResp */ uint64_t mem_file_bytes; /* - * optional set of suspect ranks + * optional set of dead ranks */ - char *suspect_ranks; + char *dead_ranks; }; #define MGMT__POOL_QUERY_RESP__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_resp__descriptor) \ diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index a2f7005db45..fb9712bbe8b 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1769,10 +1769,10 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) d_rank_list_t *svc_ranks = NULL; d_rank_list_t *enabled_ranks = NULL; d_rank_list_t *disabled_ranks = NULL; - d_rank_list_t *suspect_ranks = NULL; + d_rank_list_t *dead_ranks = NULL; char *enabled_ranks_str = NULL; char *disabled_ranks_str = NULL; - char *suspect_ranks_str = NULL; + char *dead_ranks_str = NULL; size_t len; uint8_t *body; @@ -1796,7 +1796,7 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_GOTO(error, rc = -DER_NOMEM); pool_info.pi_bits = req->query_mask; - rc = ds_mgmt_pool_query(uuid, svc_ranks, &enabled_ranks, &disabled_ranks, &suspect_ranks, + rc = ds_mgmt_pool_query(uuid, svc_ranks, &enabled_ranks, &disabled_ranks, &dead_ranks, &pool_info, &resp.pool_layout_ver, &resp.upgrade_layout_ver); if (rc != 0) { DL_ERROR(rc, DF_UUID ": Failed to 
query the pool", DP_UUID(uuid)); @@ -1819,18 +1819,17 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) DP_UUID(uuid)); D_GOTO(error, rc); } - rc = d_rank_list_to_str(suspect_ranks, &suspect_ranks_str); + rc = d_rank_list_to_str(dead_ranks, &dead_ranks_str); if (rc != 0) { - DL_ERROR(rc, DF_UUID ": Failed to serialize the list of suspect ranks", - DP_UUID(uuid)); + DL_ERROR(rc, DF_UUID ": Failed to serialize the list of dead ranks", DP_UUID(uuid)); D_GOTO(error, rc); } if (disabled_ranks_str != NULL) D_DEBUG(DB_MGMT, DF_UUID ": list of disabled ranks: %s\n", DP_UUID(uuid), disabled_ranks_str); - if (suspect_ranks_str != NULL) - D_DEBUG(DB_MGMT, DF_UUID ": list of suspect ranks: %s\n", DP_UUID(uuid), - suspect_ranks_str); + if (dead_ranks_str != NULL) + D_DEBUG(DB_MGMT, DF_UUID ": list of dead ranks: %s\n", DP_UUID(uuid), + dead_ranks_str); /* Populate the response */ resp.query_mask = pool_info.pi_bits; @@ -1847,8 +1846,8 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) resp.enabled_ranks = enabled_ranks_str; if (disabled_ranks_str != NULL) resp.disabled_ranks = disabled_ranks_str; - if (suspect_ranks_str != NULL) - resp.suspect_ranks = suspect_ranks_str; + if (dead_ranks_str != NULL) + resp.dead_ranks = dead_ranks_str; D_ALLOC_ARRAY(resp.tier_stats, DAOS_MEDIA_MAX); if (resp.tier_stats == NULL) @@ -1893,8 +1892,8 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_FREE(enabled_ranks_str); d_rank_list_free(disabled_ranks); D_FREE(disabled_ranks_str); - d_rank_list_free(suspect_ranks); - D_FREE(suspect_ranks_str); + d_rank_list_free(dead_ranks); + D_FREE(dead_ranks_str); d_rank_list_free(svc_ranks); pool_query_free_tier_stats(&resp); } diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 72e8034949a..736c5e1878e 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -121,7 +121,7 @@ int ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, uint64_t *ncontainers); int ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **suspect_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver); int ds_mgmt_pool_query_targets(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_t rank, diff --git a/src/mgmt/srv_pool.c b/src/mgmt/srv_pool.c index 1c28b867ce2..6059c20de73 100644 --- a/src/mgmt/srv_pool.c +++ b/src/mgmt/srv_pool.c @@ -389,6 +389,7 @@ ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, * \param[in] svc_ranks Ranks of pool svc replicas. * \param[out] enabled_ranks Optional, returned storage ranks with enabled targets. * \param[out] disabled_ranks Optional, returned storage ranks with disabled targets. + * \param[out] dead_ranks Optional, returned storage ranks marked DEAD by SWIM. 
* \param[in][out] pool_info Query results * \param[in][out] pool_layout_ver Pool global version * \param[in][out] upgrade_layout_ver Latest pool global version this pool might be upgraded @@ -399,7 +400,7 @@ ds_mgmt_pool_list_cont(uuid_t uuid, d_rank_list_t *svc_ranks, */ int ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **suspect_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver) { @@ -411,7 +412,7 @@ ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **e D_DEBUG(DB_MGMT, "Querying pool "DF_UUID"\n", DP_UUID(pool_uuid)); return dsc_pool_svc_query(pool_uuid, svc_ranks, mgmt_ps_call_deadline(), enabled_ranks, - disabled_ranks, suspect_ranks, pool_info, pool_layout_ver, + disabled_ranks, dead_ranks, pool_info, pool_layout_ver, upgrade_layout_ver); } diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index fa54a05b529..ba8c78a6b7d 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -739,7 +739,7 @@ tgt_vos_create_one(void *varg) rc = vos_pool_create(path, (unsigned char *)vpa->vpa_uuid, vpa->vpa_scm_size, vpa->vpa_nvme_size, vpa->vpa_meta_size, 0 /* flags */, - 0 /* version */, NULL); + ds_pool_get_vos_df_version_default(), NULL); if (rc) D_ERROR(DF_UUID": failed to init vos pool %s: %d\n", DP_UUID(vpa->vpa_uuid), path, rc); diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index bbf637264db..cc597451225 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -281,11 +281,11 @@ daos_pool_info_t ds_mgmt_pool_query_info_in; void *ds_mgmt_pool_query_info_ptr; d_rank_list_t *ds_mgmt_pool_query_enabled_ranks_out; d_rank_list_t *ds_mgmt_pool_query_disabled_ranks_out; -d_rank_list_t *ds_mgmt_pool_query_suspect_ranks_out; +d_rank_list_t *ds_mgmt_pool_query_dead_ranks_out; int ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **enabled_ranks, - d_rank_list_t **disabled_ranks, d_rank_list_t **suspect_ranks, + d_rank_list_t **disabled_ranks, d_rank_list_t **dead_ranks, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver) { @@ -312,11 +312,11 @@ ds_mgmt_pool_query(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t **e ds_mgmt_pool_query_disabled_ranks_out = *disabled_ranks; } - if ((pool_info->pi_bits & DPI_ENGINES_SUSPECT) != 0) { - D_ASSERT(suspect_ranks != NULL); + if ((pool_info->pi_bits & DPI_ENGINES_DEAD) != 0) { + D_ASSERT(dead_ranks != NULL); - *suspect_ranks = d_rank_list_alloc(2); /* 0-1 ; caller must free this */ - ds_mgmt_pool_query_suspect_ranks_out = *suspect_ranks; + *dead_ranks = d_rank_list_alloc(2); /* 0-1 ; caller must free this */ + ds_mgmt_pool_query_dead_ranks_out = *dead_ranks; } ds_mgmt_pool_query_info_in = *pool_info; @@ -334,7 +334,7 @@ mock_ds_mgmt_pool_query_setup(void) memset(&ds_mgmt_pool_query_info_out, 0, sizeof(daos_pool_info_t)); ds_mgmt_pool_query_enabled_ranks_out = NULL; ds_mgmt_pool_query_disabled_ranks_out = NULL; - ds_mgmt_pool_query_suspect_ranks_out = NULL; + ds_mgmt_pool_query_dead_ranks_out = NULL; } int ds_mgmt_pool_query_targets_return; diff --git a/src/mgmt/tests/mocks.h b/src/mgmt/tests/mocks.h index 2ecc03d8ed9..ea93e7199b1 100644 --- a/src/mgmt/tests/mocks.h +++ b/src/mgmt/tests/mocks.h @@ -110,7 +110,7 @@ extern daos_pool_info_t ds_mgmt_pool_query_info_out; extern void *ds_mgmt_pool_query_info_ptr; extern d_rank_list_t 
*ds_mgmt_pool_query_enabled_ranks_out; extern d_rank_list_t *ds_mgmt_pool_query_disabled_ranks_out; -extern d_rank_list_t *ds_mgmt_pool_query_suspect_ranks_out; +extern d_rank_list_t *ds_mgmt_pool_query_dead_ranks_out; void mock_ds_mgmt_pool_query_setup(void); diff --git a/src/mgmt/tests/srv_drpc_tests.c b/src/mgmt/tests/srv_drpc_tests.c index a0682af6330..7c7c5e26c70 100644 --- a/src/mgmt/tests/srv_drpc_tests.c +++ b/src/mgmt/tests/srv_drpc_tests.c @@ -1408,7 +1408,7 @@ test_drpc_pool_query_success(void **state) Drpc__Response resp = DRPC__RESPONSE__INIT; uuid_t exp_uuid; daos_pool_info_t exp_info = {0}; - uint64_t flags = DPI_ENGINES_ENABLED | DPI_ENGINES_DISABLED | DPI_ENGINES_SUSPECT; + uint64_t flags = DPI_ENGINES_ENABLED | DPI_ENGINES_DISABLED | DPI_ENGINES_DEAD; init_test_pool_info(&exp_info); init_test_rebuild_status(&exp_info.pi_rebuild_st); @@ -1425,7 +1425,7 @@ test_drpc_pool_query_success(void **state) assert_non_null(ds_mgmt_pool_query_info_ptr); assert_non_null(ds_mgmt_pool_query_enabled_ranks_out); assert_non_null(ds_mgmt_pool_query_disabled_ranks_out); - assert_non_null(ds_mgmt_pool_query_suspect_ranks_out); + assert_non_null(ds_mgmt_pool_query_dead_ranks_out); flags |= DEFAULT_QUERY_BITS; assert_int_equal(ds_mgmt_pool_query_info_in.pi_bits, DEFAULT_QUERY_BITS | flags); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 649f236e829..61a82301367 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -87,6 +87,7 @@ struct migrate_one { uint32_t mo_generation; d_list_t mo_list; d_iov_t mo_csum_iov; + uint32_t mo_opc; }; struct migrate_obj_key { @@ -305,8 +306,8 @@ obj_tree_insert(daos_handle_t toh, uuid_t co_uuid, uint64_t tgt_id, daos_unit_oi d_iov_set(&key_iov, &oid, sizeof(oid)); rc = dbtree_lookup(cont_root->root_hdl, &key_iov, val_iov); if (rc == 0) { - D_DEBUG(DB_TRACE, DF_UOID"/"DF_UUID" already exits\n", - DP_UOID(oid), DP_UUID(co_uuid)); + D_DEBUG(DB_TRACE, DF_UOID "/" DF_UUID " already exists\n", DP_UOID(oid), + DP_UUID(co_uuid)); return -DER_EXIST; } @@ -386,8 +387,7 @@ migrate_pool_tls_destroy(struct migrate_pool_tls *tls) if (tls->mpt_dkey_ult_cnts) D_FREE(tls->mpt_dkey_ult_cnts); d_list_del(&tls->mpt_list); - D_DEBUG(DB_REBUILD, "TLS destroy for "DF_UUID" ver %d\n", - DP_UUID(tls->mpt_pool_uuid), tls->mpt_version); + D_DEBUG(DB_REBUILD, DF_RB ": TLS destroy\n", DP_RB_MPT(tls)); if (tls->mpt_pool) ds_pool_child_put(tls->mpt_pool); if (tls->mpt_svc_list.rl_ranks) @@ -559,10 +559,9 @@ migrate_pool_tls_create_one(void *data) D_GOTO(out, rc); } - D_DEBUG(DB_REBUILD, "TLS %p create for "DF_UUID" "DF_UUID"/"DF_UUID - " ver %d "DF_RC"\n", pool_tls, DP_UUID(pool_tls->mpt_pool_uuid), - DP_UUID(arg->pool_hdl_uuid), DP_UUID(arg->co_hdl_uuid), - arg->version, DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB ": TLS %p create for hdls " DF_UUID "/" DF_UUID " " DF_RC "\n", + DP_RB_MPT(pool_tls), pool_tls, DP_UUID(arg->pool_hdl_uuid), + DP_UUID(arg->co_hdl_uuid), DP_RC(rc)); d_list_add(&pool_tls->mpt_list, &tls->ot_pool_list); out: if (rc && pool_tls) @@ -655,8 +654,8 @@ migrate_pool_tls_lookup_create(struct ds_pool *pool, unsigned int version, unsig PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, migrate_pool_tls_create_one, &arg, 0); if (rc != 0) { - D_ERROR(DF_UUID": failed to create migrate tls: "DF_RC"\n", - DP_UUID(pool->sp_uuid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": failed to create migrate tls on tgt xstreams", + DP_RB_MPT(tls)); D_GOTO(out, rc); } @@ -670,8 +669,7 @@ migrate_pool_tls_lookup_create(struct ds_pool *pool, 
unsigned int version, unsig ABT_cond_broadcast(tls->mpt_init_cond); ABT_mutex_unlock(tls->mpt_init_mutex); } - D_DEBUG(DB_TRACE, "create tls "DF_UUID": "DF_RC"\n", - DP_UUID(pool->sp_uuid), DP_RC(rc)); + D_DEBUG(DB_TRACE, "create tls " DF_UUID ": " DF_RC "\n", DP_UUID(pool->sp_uuid), DP_RC(rc)); if (rc != 0) { if (tls != NULL) @@ -720,8 +718,8 @@ mrone_recx_daos_vos_internal(struct migrate_one *mrone, bool daos2vos, int shard stripe_nr, cell_nr, shard); - D_DEBUG(DB_REBUILD, "j %d k %d "DF_U64"/"DF_U64"\n", - j, k, recx->rx_idx, recx->rx_nr); + D_DEBUG(DB_REBUILD, DF_RB ": j %d k %d " DF_U64 "/" DF_U64 "\n", + DP_RB_MRO(mrone), j, k, recx->rx_idx, recx->rx_nr); } } } @@ -756,8 +754,7 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_ /* If pool map does not change, then let's retry for timeout, instead of * fail out. */ - D_WARN(DF_UUID" retry "DF_UOID" "DF_RC"\n", - DP_UUID(tls->mpt_pool_uuid), DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_WARN(rc, DF_RB ": retry " DF_UOID, DP_RB_MPT(tls), DP_UOID(mrone->mo_oid)); D_GOTO(retry, rc); } @@ -775,8 +772,7 @@ mrone_obj_fetch(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_t *sgls, tls = migrate_pool_tls_lookup(mrone->mo_pool_uuid, mrone->mo_pool_tls_version, mrone->mo_generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(mrone->mo_pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(mrone->mo_pool_uuid)); D_GOTO(out, rc = -DER_SHUTDOWN); } @@ -822,14 +818,14 @@ migrate_csum_calc(struct daos_csummer *csummer, struct migrate_one *mrone, daos_ int rc; if (daos_oclass_is_ec(&mrone->mo_oca)) { - D_DEBUG(DB_CSUM, DF_C_UOID_DKEY" REBUILD: Calculating csums. IOD count: %d\n", - DP_C_UOID_DKEY(mrone->mo_oid, &mrone->mo_dkey), iod_num); + D_DEBUG(DB_CSUM, DF_RB ": " DF_C_UOID_DKEY ": Calculating csums. IOD count: %d\n", + DP_RB_MRO(mrone), DP_C_UOID_DKEY(mrone->mo_oid, &mrone->mo_dkey), iod_num); rc = daos_csummer_calc_iods(csummer, sgls, iods, NULL, iod_num, false, NULL, -1, iod_csums); return rc; } - D_DEBUG(DB_CSUM, DF_C_UOID_DKEY" REBUILD: Using packed csums\n", + D_DEBUG(DB_CSUM, DF_RB ": " DF_C_UOID_DKEY ": Using packed csums\n", DP_RB_MRO(mrone), DP_C_UOID_DKEY(mrone->mo_oid, &mrone->mo_dkey)); /** make a copy of the iov because it will be modified while * iterating over the csums @@ -840,7 +836,7 @@ migrate_csum_calc(struct daos_csummer *csummer, struct migrate_one *mrone, daos_ rc = daos_csummer_alloc_iods_csums_with_packed(csummer, iods, iod_num, p_csum_iov, iod_csums); if (rc != 0) - D_ERROR("Failed to alloc iod csums: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": failed to alloc iod csums", DP_RB_MRO(mrone)); return rc; } @@ -893,9 +889,10 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, } } - D_DEBUG(DB_REBUILD, DF_UOID " mrone %p dkey " DF_KEY " nr %d eph " DF_U64 " fetch %s\n", - DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), mrone->mo_iod_num, - mrone->mo_epoch, fetch ? "yes" : "no"); + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " mrone %p dkey " DF_KEY " nr %d eph " DF_U64 " fetch %s\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), + mrone->mo_iod_num, mrone->mo_epoch, fetch ? 
"yes" : "no"); if (DAOS_FAIL_CHECK(DAOS_REBUILD_NO_UPDATE)) return 0; @@ -916,7 +913,7 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, mrone->mo_epoch, DIOF_FOR_MIGRATION, p_csum_iov); if (rc) { - D_ERROR("mrone_obj_fetch "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": mrone_obj_fetch", DP_RB_MRO(mrone)); D_GOTO(out, rc); } } @@ -937,16 +934,17 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, /* skip empty record */ if (iod_cnt == 0) { - D_DEBUG(DB_TRACE, "i %d iod_size = 0\n", i); + D_DEBUG(DB_TRACE, DF_RB ": i %d iod_size = 0\n", DP_RB_MRO(mrone), i); continue; } - D_DEBUG(DB_TRACE, "update start %d cnt %d\n", start, iod_cnt); + D_DEBUG(DB_TRACE, DF_RB ": update start %d cnt %d\n", DP_RB_MRO(mrone), start, + iod_cnt); rc = migrate_csum_calc(csummer, mrone, &iods[start], iod_cnt, &sgls[start], fetch ? &csum_iov : &mrone->mo_csum_iov, &iod_csums); if (rc != 0) { - D_ERROR("Error calculating checksums: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": error calculating checksums", DP_RB_MRO(mrone)); break; } @@ -956,7 +954,7 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, iod_csums, &sgls[start]); daos_csummer_free_ic(csummer, &iod_csums); if (rc) { - D_ERROR("migrate failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate failed", DP_RB_MRO(mrone)); break; } iod_cnt = 0; @@ -968,7 +966,7 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, &sgls[start], fetch ? &csum_iov : &mrone->mo_csum_iov, &iod_csums); if (rc != 0) { - D_ERROR("Error calculating checksums: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": error calculating checksums", DP_RB_MRO(mrone)); D_GOTO(out, rc); } @@ -978,7 +976,7 @@ migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, &mrone->mo_iods[start], iod_csums, &sgls[start]); if (rc) { - D_ERROR("migrate failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate failed", DP_RB_MRO(mrone)); D_GOTO(out, rc); } daos_csummer_free_ic(csummer, &iod_csums); @@ -1037,15 +1035,14 @@ migrate_update_parity(struct migrate_one *mrone, daos_epoch_t parity_eph, tmp_recx.rx_nr = cell_nr; d_iov_set(&tmp_iov, p_bufs[shard], cell_nr * iod->iod_size); - D_DEBUG(DB_IO, "parity "DF_X64"/"DF_U64" "DF_U64"\n", - tmp_recx.rx_idx, tmp_recx.rx_nr, iod->iod_size); + D_DEBUG(DB_IO, DF_RB ": parity " DF_X64 "/" DF_U64 " " DF_U64 "\n", + DP_RB_MRO(mrone), tmp_recx.rx_idx, tmp_recx.rx_nr, iod->iod_size); } else { tmp_recx.rx_idx = offset; tmp_recx.rx_nr = write_nr; d_iov_set(&tmp_iov, buffer, write_nr * iod->iod_size); - D_DEBUG(DB_IO, "replicate "DF_U64"/"DF_U64" " - DF_U64"\n", tmp_recx.rx_idx, - tmp_recx.rx_nr, iod->iod_size); + D_DEBUG(DB_IO, DF_RB ": replicate " DF_U64 "/" DF_U64 " " DF_U64 "\n", + DP_RB_MRO(mrone), tmp_recx.rx_idx, tmp_recx.rx_nr, iod->iod_size); } tmp_sgl.sg_iovs = &tmp_iov; @@ -1054,8 +1051,7 @@ migrate_update_parity(struct migrate_one *mrone, daos_epoch_t parity_eph, rc = daos_csummer_calc_iods(csummer, &tmp_sgl, iod, NULL, 1, false, NULL, 0, &iod_csums); if (rc != 0) { - D_ERROR("Error calculating checksums: "DF_RC"\n", - DP_RC(rc)); + DL_ERROR(rc, DF_RB ": drror calculating checksums", DP_RB_MRO(mrone)); D_GOTO(out, rc); } @@ -1106,14 +1102,15 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, sgls[i].sg_iovs = &iov[i]; } - D_DEBUG(DB_REBUILD, DF_UOID" mrone %p dkey "DF_KEY" nr %d eph "DF_U64"\n", - DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), iods_num, mrone->mo_epoch); + D_DEBUG(DB_REBUILD, DF_RB ": " 
DF_UOID " mrone %p dkey " DF_KEY " nr %d eph " DF_U64 "\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), iods_num, + mrone->mo_epoch); rc = mrone_obj_fetch(mrone, oh, sgls, iods, iods_num, fetch_eph, DIOF_FOR_MIGRATION, NULL); if (rc) { - D_ERROR("migrate dkey "DF_KEY" failed: "DF_RC"\n", - DP_KEY(&mrone->mo_dkey), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate dkey " DF_KEY " failed", DP_RB_MRO(mrone), + DP_KEY(&mrone->mo_dkey)); D_GOTO(out, rc); } @@ -1265,9 +1262,8 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, sgls[i].sg_iovs = &iov[i]; } - D_DEBUG(DB_REBUILD, - DF_UOID" mrone %p dkey "DF_KEY" nr %d eph "DF_U64"\n", - DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " mrone %p dkey " DF_KEY " nr %d eph " DF_U64 "\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), mrone->mo_iod_num, mrone->mo_epoch); if (!daos_oclass_is_ec(&mrone->mo_oca)) { @@ -1280,14 +1276,15 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, rc = mrone_obj_fetch(mrone, oh, sgls, mrone->mo_iods, mrone->mo_iod_num, mrone->mo_epoch, DIOF_FOR_MIGRATION, p_csum_iov); if (rc == -DER_CSUM) { - D_ERROR("migrate dkey "DF_KEY" failed because of checksum " - "error ("DF_RC"). Don't fail whole rebuild.\n", - DP_KEY(&mrone->mo_dkey), DP_RC(rc)); + DL_ERROR(rc, + DF_RB ": migrate dkey " DF_KEY " failed because of checksum error. " + "Don't fail whole rebuild", + DP_RB_MRO(mrone), DP_KEY(&mrone->mo_dkey)); D_GOTO(out, rc = 0); } if (rc) { - D_ERROR("migrate dkey "DF_KEY" failed: "DF_RC"\n", - DP_KEY(&mrone->mo_dkey), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate dkey " DF_KEY " failed", DP_RB_MRO(mrone), + DP_KEY(&mrone->mo_dkey)); D_GOTO(out, rc); } @@ -1311,13 +1308,11 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, */ rc = -DER_DATA_LOSS; D_DEBUG(DB_REBUILD, - DF_UOID" %p dkey "DF_KEY" "DF_KEY" nr %d/%d" - " eph "DF_U64" "DF_RC"\n", - DP_UOID(mrone->mo_oid), - mrone, DP_KEY(&mrone->mo_dkey), - DP_KEY(&mrone->mo_iods[i].iod_name), - mrone->mo_iod_num, i, mrone->mo_epoch, - DP_RC(rc)); + DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY + " nr %d/%d eph " DF_U64 " " DF_RC "\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, + DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), + mrone->mo_iod_num, i, mrone->mo_epoch, DP_RC(rc)); D_GOTO(out, rc); } @@ -1325,7 +1320,7 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, continue; if (obj_ec_singv_one_tgt(iod->iod_size, &sgls[i], &mrone->mo_oca)) { - D_DEBUG(DB_REBUILD, DF_UOID" one tgt.\n", + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " one tgt.\n", DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid)); los[i].cs_even_dist = 0; continue; @@ -1355,7 +1350,7 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, mrone->mo_iods[i].iod_size, &mrone->mo_oca); los[i].cs_nr = obj_ec_tgt_nr(&mrone->mo_oca); - D_DEBUG(DB_CSUM, "los[%d]: "DF_LAYOUT"\n", i, + D_DEBUG(DB_CSUM, DF_RB ": los[%d]: " DF_LAYOUT "\n", DP_RB_MRO(mrone), i, DP_LAYOUT(los[i])); } @@ -1363,7 +1358,7 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, rc = migrate_csum_calc(csummer, mrone, mrone->mo_iods, mrone->mo_iod_num, sgls, p_csum_iov, &iod_csums); if (rc != 0) { - D_ERROR("unable to calculate iod csums: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": unable to calculate iod csums", DP_RB_MRO(mrone)); goto out; } @@ -1414,16 +1409,16 @@ 
__migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, &mrone->mo_dkey, iod_num, iods, mrone->mo_iods_csums, 0, &ioh, NULL); if (rc != 0) { - D_ERROR(DF_UOID ": preparing update fails: " DF_RC "\n", DP_UOID(mrone->mo_oid), - DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID ": preparing update failed", DP_RB_MRO(mrone), + DP_UOID(mrone->mo_oid)); return rc; } rc = bio_iod_prep(vos_ioh2desc(ioh), BIO_CHK_TYPE_REBUILD, NULL, CRT_BULK_RW); if (rc) { - D_ERROR("Prepare EIOD for "DF_UOID" error: "DF_RC"\n", - DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": prepare EIOD for " DF_UOID " error", DP_RB_MRO(mrone), + DP_UOID(mrone->mo_oid)); goto end; } @@ -1440,9 +1435,9 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, } D_DEBUG(DB_REBUILD, - DF_UOID" mrone %p dkey "DF_KEY" nr %d eph "DF_X64"/"DF_X64"\n", - DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), - iod_num, mrone->mo_epoch, update_eph); + DF_RB ": " DF_UOID " mrone %p dkey " DF_KEY " nr %d eph " DF_X64 "/" DF_X64 "\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), iod_num, + mrone->mo_epoch, update_eph); if (daos_oclass_is_ec(&mrone->mo_oca)) mrone_recx_vos2_daos(mrone, mrone->mo_oid.id_shard, iods, iod_num); @@ -1456,15 +1451,15 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, rc = mrone_obj_fetch(mrone, oh, sgls, iods, iod_num, fetch_eph, flags, p_csum_iov); if (rc) { - D_ERROR("migrate dkey "DF_KEY" failed: "DF_RC"\n", - DP_KEY(&mrone->mo_dkey), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate dkey " DF_KEY " failed", DP_RB_MRO(mrone), + DP_KEY(&mrone->mo_dkey)); D_GOTO(post, rc); } csummer = dsc_cont2csummer(dc_obj_hdl2cont_hdl(oh)); rc = migrate_csum_calc(csummer, mrone, iods, iod_num, sgls, p_csum_iov, &iod_csums); if (rc != 0) { - D_ERROR("Failed to calculate iod csums: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": failed to calculate iod csums", DP_RB_MRO(mrone)); D_GOTO(post, rc); } @@ -1478,8 +1473,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, rc = bio_iod_post(vos_ioh2desc(ioh), rc); if (rc) - D_ERROR("Post EIOD for "DF_UOID" error: "DF_RC"\n", - DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": post EIOD for " DF_UOID " error", DP_RB_MRO(mrone), + DP_UOID(mrone->mo_oid)); for (i = 0; rc == 0 && i < iod_num; i++) { if (iods[i].iod_size == 0) { @@ -1495,12 +1490,11 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, */ rc = -DER_DATA_LOSS; D_DEBUG(DB_REBUILD, - DF_UOID" %p dkey "DF_KEY" "DF_KEY" nr %d/%d" - " eph "DF_U64" "DF_RC"\n", - DP_UOID(mrone->mo_oid), - mrone, DP_KEY(&mrone->mo_dkey), - DP_KEY(&iods[i].iod_name), iod_num, i, mrone->mo_epoch, - DP_RC(rc)); + DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY + " nr %d/%d eph " DF_U64 " " DF_RC "\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, + DP_KEY(&mrone->mo_dkey), DP_KEY(&iods[i].iod_name), iod_num, i, + mrone->mo_epoch, DP_RC(rc)); D_GOTO(end, rc); } } @@ -1513,7 +1507,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, rc = rc1; if (rc) - D_ERROR(DF_UOID " migrate error: "DF_RC"\n", DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " migrate error", DP_RB_MRO(mrone), + DP_UOID(mrone->mo_oid)); return rc; } @@ -1598,8 +1593,10 @@ migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, * from parity will be rebuilt first. so let's ignore * this replicate recx for now. 
*/ - D_WARN(DF_UOID" "DF_RECX"/"DF_X64" already rebuilt\n", - DP_UOID(mrone->mo_oid), DP_RECX(iod.iod_recxs[0]), + D_WARN(DF_RB ": " DF_UOID " " DF_RECX "/" DF_X64 " already " "rebuilt\n", + DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), + DP_RECX(iod.iod_recxs[0]), mrone->mo_iods_update_ephs[i][j]); rc = 0; } else { @@ -1624,16 +1621,16 @@ migrate_punch(struct migrate_pool_tls *tls, struct migrate_one *mrone, /* Punch dkey */ if (mrone->mo_dkey_punch_eph != 0 && mrone->mo_dkey_punch_eph <= tls->mpt_max_eph) { - D_DEBUG(DB_REBUILD, DF_UOID" punch dkey "DF_KEY"/"DF_U64"\n", - DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " punch dkey " DF_KEY "/" DF_U64 "\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), mrone->mo_dkey_punch_eph); rc = vos_obj_punch(cont->sc_hdl, mrone->mo_oid, mrone->mo_dkey_punch_eph, tls->mpt_version, VOS_OF_REPLAY_PC, &mrone->mo_dkey, 0, NULL, NULL); if (rc) { - D_ERROR(DF_UOID" punch dkey failed: "DF_RC"\n", - DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " punch dkey failed", DP_RB_MPT(tls), + DP_UOID(mrone->mo_oid)); return rc; } } @@ -1644,17 +1641,19 @@ migrate_punch(struct migrate_pool_tls *tls, struct migrate_one *mrone, eph = mrone->mo_akey_punch_ephs[i]; D_ASSERT(eph != DAOS_EPOCH_MAX); if (eph == 0 || eph > tls->mpt_max_eph) { - D_DEBUG(DB_REBUILD, DF_UOID" skip mrone %p punch dkey " - DF_KEY" akey "DF_KEY" eph "DF_X64" current "DF_X64"\n", - DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), - DP_KEY(&mrone->mo_iods[i].iod_name), eph, mrone->mo_epoch); + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " skip mrone %p punch dkey " DF_KEY + " akey " DF_KEY " eph " DF_X64 " current " DF_X64 "\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, + DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), eph, + mrone->mo_epoch); continue; } - D_DEBUG(DB_REBUILD, DF_UOID" mrone %p punch dkey " - DF_KEY" akey "DF_KEY" eph "DF_U64"\n", - DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " mrone %p punch dkey " DF_KEY " akey " DF_KEY + " eph " DF_U64 "\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), eph); rc = vos_obj_punch(cont->sc_hdl, mrone->mo_oid, @@ -1682,10 +1681,10 @@ migrate_punch(struct migrate_pool_tls *tls, struct migrate_one *mrone, mrone->mo_version, 0, &mrone->mo_dkey, mrone->mo_punch_iod_num, mrone->mo_punch_iods, NULL, NULL); - D_DEBUG(DB_REBUILD, DF_UOID" mrone %p punch %d eph "DF_U64 - " records: "DF_RC"\n", DP_UOID(mrone->mo_oid), mrone, - mrone->mo_punch_iod_num, mrone->mo_rec_punch_eph, - DP_RC(rc)); + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " mrone %p punch %d eph " DF_U64 " records: " DF_RC "\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, mrone->mo_punch_iod_num, + mrone->mo_rec_punch_eph, DP_RC(rc)); } return rc; @@ -1700,8 +1699,7 @@ migrate_get_cont_child(struct migrate_pool_tls *tls, uuid_t cont_uuid, *cont_p = NULL; if (tls->mpt_pool->spc_pool->sp_stopping) { - D_DEBUG(DB_REBUILD, DF_UUID "pool is being destroyed.\n", - DP_UUID(tls->mpt_pool_uuid)); + D_DEBUG(DB_REBUILD, DF_RB ": pool is being destroyed.\n", DP_RB_MPT(tls)); return 0; } @@ -1712,8 +1710,10 @@ migrate_get_cont_child(struct migrate_pool_tls *tls, uuid_t cont_uuid, rc = ds_cont_child_open_create(tls->mpt_pool_uuid, cont_uuid, &cont_child); if (rc != 0) { if (rc == -DER_SHUTDOWN || (cont_child && cont_child->sc_stopping)) { - D_DEBUG(DB_REBUILD, DF_UUID "container is 
being destroyed\n", - DP_UUID(cont_uuid)); + D_DEBUG(DB_REBUILD, + DF_RB ": container " DF_UUID " is being " + "destroyed\n", + DP_RB_MPT(tls), DP_UUID(cont_uuid)); rc = 0; } if (cont_child) @@ -1724,8 +1724,10 @@ migrate_get_cont_child(struct migrate_pool_tls *tls, uuid_t cont_uuid, rc = ds_cont_child_lookup(tls->mpt_pool_uuid, cont_uuid, &cont_child); if (rc != 0 || (cont_child && cont_child->sc_stopping)) { if (rc == -DER_NONEXIST || (cont_child && cont_child->sc_stopping)) { - D_DEBUG(DB_REBUILD, DF_UUID "container is being destroyed\n", - DP_UUID(cont_uuid)); + D_DEBUG(DB_REBUILD, + DF_RB ": container " DF_UUID " is being " + "destroyed\n", + DP_RB_MPT(tls), DP_UUID(cont_uuid)); rc = 0; } @@ -1774,15 +1776,14 @@ migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, D_GOTO(obj_close, rc = -DER_NOSPACE); if (DAOS_FAIL_CHECK(DAOS_REBUILD_NO_REBUILD)) { - D_DEBUG(DB_REBUILD, DF_UUID" disable rebuild\n", - DP_UUID(tls->mpt_pool_uuid)); + D_DEBUG(DB_REBUILD, DF_RB ": fault injected, disable rebuild\n", DP_RB_MPT(tls)); D_GOTO(obj_close, rc); } dsc_cont_get_props(coh, &props); rc = dsc_obj_id2oc_attr(mrone->mo_oid.id_pub, &props, &mrone->mo_oca); if (rc) { - D_ERROR("Unknown object class: %u\n", + D_ERROR(DF_RB ": unknown object class: %u\n", DP_RB_MPT(tls), daos_obj_id2class(mrone->mo_oid.id_pub)); D_GOTO(obj_close, rc); } @@ -1794,8 +1795,8 @@ migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, tls->mpt_version, VOS_OF_REPLAY_PC, NULL, 0, NULL, NULL); if (rc) { - D_ERROR(DF_UOID" punch obj failed: "DF_RC"\n", - DP_UOID(mrone->mo_oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " punch obj failed", DP_RB_MPT(tls), + DP_UOID(mrone->mo_oid)); D_GOTO(obj_close, rc); } } @@ -1805,7 +1806,7 @@ migrate_dkey(struct migrate_pool_tls *tls, struct migrate_one *mrone, D_GOTO(obj_close, rc); if (data_size == 0) { - D_DEBUG(DB_REBUILD, "empty mrone %p\n", mrone); + D_DEBUG(DB_REBUILD, DF_RB ": empty mrone %p\n", DP_RB_MPT(tls), mrone); D_GOTO(obj_close, rc); } @@ -1901,8 +1902,8 @@ migrate_system_enter(struct migrate_pool_tls *tls, int tgt_idx, bool *yielded) atomic_load(&tls->mpt_dkey_ult_cnts[tgt_idx]); while ((tls->mpt_inflight_max_ult / dss_tgt_nr) <= tgt_cnt) { - D_DEBUG(DB_REBUILD, "tgt%d:%u max %u\n", - tgt_idx, tgt_cnt, tls->mpt_inflight_max_ult / dss_tgt_nr); + D_DEBUG(DB_REBUILD, DF_RB ": tgt%d:%u max %u\n", DP_RB_MPT(tls), tgt_idx, tgt_cnt, + tls->mpt_inflight_max_ult / dss_tgt_nr); *yielded = true; ABT_mutex_lock(tls->mpt_inflight_mutex); ABT_cond_wait(tls->mpt_inflight_cond, tls->mpt_inflight_mutex); @@ -1929,7 +1930,8 @@ migrate_tgt_enter(struct migrate_pool_tls *tls) dkey_cnt = atomic_load(tls->mpt_tgt_dkey_ult_cnt); while (tls->mpt_inflight_max_ult / 2 <= dkey_cnt) { - D_DEBUG(DB_REBUILD, "tgt %u max %u\n", dkey_cnt, tls->mpt_inflight_max_ult); + D_DEBUG(DB_REBUILD, DF_RB ": tgt %u max %u\n", DP_RB_MPT(tls), dkey_cnt, + tls->mpt_inflight_max_ult); ABT_mutex_lock(tls->mpt_inflight_mutex); ABT_cond_wait(tls->mpt_inflight_cond, tls->mpt_inflight_mutex); @@ -2020,8 +2022,7 @@ migrate_one_ult(void *arg) tls = migrate_pool_tls_lookup(mrone->mo_pool_uuid, mrone->mo_pool_tls_version, mrone->mo_generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(mrone->mo_pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(mrone->mo_pool_uuid)); goto out; } @@ -2029,18 +2030,19 @@ migrate_one_ult(void *arg) data_size += daos_iods_len(mrone->mo_iods_from_parity, mrone->mo_iods_num_from_parity); - 
D_DEBUG(DB_TRACE, "mrone %p data size is "DF_U64" %d/%d\n", - mrone, data_size, mrone->mo_iod_num, mrone->mo_iods_num_from_parity); + D_DEBUG(DB_TRACE, DF_RB ": mrone %p data size is " DF_U64 " %d/%d\n", DP_RB_MPT(tls), mrone, + data_size, mrone->mo_iod_num, mrone->mo_iods_num_from_parity); D_ASSERT(data_size != (daos_size_t)-1); - D_DEBUG(DB_REBUILD, "mrone %p inflight_size "DF_U64" max "DF_U64"\n", - mrone, tls->mpt_inflight_size, tls->mpt_inflight_max_size); + D_DEBUG(DB_REBUILD, DF_RB ": mrone %p inflight_size " DF_U64 " max " DF_U64 "\n", + DP_RB_MPT(tls), mrone, tls->mpt_inflight_size, tls->mpt_inflight_max_size); while (tls->mpt_inflight_size + data_size >= tls->mpt_inflight_max_size && tls->mpt_inflight_max_size != 0 && tls->mpt_inflight_size != 0 && !tls->mpt_fini) { - D_DEBUG(DB_REBUILD, "mrone %p wait "DF_U64"/"DF_U64"/"DF_U64"\n", mrone, - tls->mpt_inflight_size, tls->mpt_inflight_max_size, data_size); + D_DEBUG(DB_REBUILD, DF_RB ": mrone %p wait " DF_U64 "/" DF_U64 "/" DF_U64 "\n", + DP_RB_MPT(tls), mrone, tls->mpt_inflight_size, tls->mpt_inflight_max_size, + data_size); ABT_mutex_lock(tls->mpt_inflight_mutex); ABT_cond_wait(tls->mpt_inflight_cond, tls->mpt_inflight_mutex); ABT_mutex_unlock(tls->mpt_inflight_mutex); @@ -2053,8 +2055,10 @@ migrate_one_ult(void *arg) rc = migrate_dkey(tls, mrone, data_size); tls->mpt_inflight_size -= data_size; - D_DEBUG(DB_REBUILD, DF_UOID" layout %u migrate dkey "DF_KEY" inflight_size "DF_U64": " - DF_RC"\n", DP_UOID(mrone->mo_oid), mrone->mo_oid.id_layout_ver, + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " layout %u migrate dkey " DF_KEY " inflight_size " DF_U64 + ": " DF_RC "\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone->mo_oid.id_layout_ver, DP_KEY(&mrone->mo_dkey), tls->mpt_inflight_size, DP_RC(rc)); /* Ignore nonexistent error because puller could race @@ -2246,7 +2250,8 @@ rw_iod_pack(struct migrate_one *mrone, struct dc_object *obj, daos_iod_t *iod, if (iod->iod_type == DAOS_IOD_SINGLE) { rec_cnt = 1; total_size = iod->iod_size; - D_DEBUG(DB_REBUILD, "single recx "DF_U64"\n", total_size); + D_DEBUG(DB_REBUILD, DF_RB ": single recx " DF_U64 "\n", DP_RB_MRO(mrone), + total_size); rc = migrate_insert_recxs_sgl(mrone->mo_iods, mrone->mo_iods_update_ephs, &mrone->mo_iod_num, iod, &iod->iod_recxs[0], &ephs[0], 1, mrone->mo_sgls, sgl, 0); @@ -2285,9 +2290,10 @@ rw_iod_pack(struct migrate_one *mrone, struct dc_object *obj, daos_iod_t *iod, nr = 0; } parity_nr++; - D_DEBUG(DB_REBUILD, "parity recx "DF_X64"/"DF_X64" %d/%d\n", - iod->iod_recxs[i].rx_idx, iod->iod_recxs[i].rx_nr, - parity_nr, nr); + D_DEBUG(DB_REBUILD, + DF_RB ": parity recx " DF_X64 "/" DF_X64 " %d/%d\n", + DP_RB_MRO(mrone), iod->iod_recxs[i].rx_idx, + iod->iod_recxs[i].rx_nr, parity_nr, nr); iod->iod_recxs[i].rx_idx = iod->iod_recxs[i].rx_idx & ~PARITY_INDICATOR; } else { @@ -2308,9 +2314,10 @@ rw_iod_pack(struct migrate_one *mrone, struct dc_object *obj, daos_iod_t *iod, parity_nr = 0; } nr++; - D_DEBUG(DB_REBUILD, "replicate recx "DF_X64"/"DF_X64" %d/%d\n", - iod->iod_recxs[i].rx_idx, iod->iod_recxs[i].rx_nr, - parity_nr, nr); + D_DEBUG(DB_REBUILD, + DF_RB ": replicate recx " DF_X64 "/" DF_X64 " %d/%d\n", + DP_RB_MRO(mrone), iod->iod_recxs[i].rx_idx, + iod->iod_recxs[i].rx_nr, parity_nr, nr); } } @@ -2339,9 +2346,10 @@ rw_iod_pack(struct migrate_one *mrone, struct dc_object *obj, daos_iod_t *iod, mrone->mo_size += total_size; out: D_DEBUG(DB_REBUILD, - "idx %d akey "DF_KEY" nr %d size "DF_U64" type %d rec %d total " - DF_U64"\n", mrone->mo_iod_num - 1, 
DP_KEY(&iod->iod_name), - iod->iod_nr, iod->iod_size, iod->iod_type, rec_cnt, total_size); + DF_RB ": idx %d akey " DF_KEY " nr %d size " DF_U64 " type %d rec %d total " DF_U64 + "\n", + DP_RB_MRO(mrone), mrone->mo_iod_num - 1, DP_KEY(&iod->iod_name), iod->iod_nr, + iod->iod_size, iod->iod_type, rec_cnt, total_size); return rc; } @@ -2479,19 +2487,17 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io int i; int rc = 0; - D_DEBUG(DB_REBUILD, "migrate dkey "DF_KEY" iod nr %d\n", DP_KEY(dkey), - iod_eph_total); - tls = migrate_pool_tls_lookup(iter_arg->pool_uuid, iter_arg->version, iter_arg->generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(iter_arg->pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "dkey " DF_KEY "iod_nr %d\n", + DP_UUID(iter_arg->pool_uuid), DP_KEY(dkey), iod_eph_total); D_GOTO(put, rc = 0); } + D_DEBUG(DB_REBUILD, DF_RB ": migrate dkey " DF_KEY " iod nr %d\n", DP_RB_MPT(tls), + DP_KEY(dkey), iod_eph_total); if (iod_eph_total == 0 || tls->mpt_fini) { - D_DEBUG(DB_REBUILD, "No need eph_total %d version %u" - " migrate ver %u fini %d\n", iod_eph_total, version, - tls->mpt_version, tls->mpt_fini); + D_DEBUG(DB_REBUILD, DF_RB ": no need eph_total %d version %u fini %d\n", + DP_RB_MPT(tls), iod_eph_total, version, tls->mpt_fini); D_GOTO(put, rc = 0); } @@ -2546,6 +2552,7 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io mrone->mo_generation = tls->mpt_generation; mrone->mo_dkey_hash = io->ui_dkey_hash; mrone->mo_layout_version = obj->cob_layout_version; + mrone->mo_opc = tls->mpt_opc; /* only do the copy below when each with inline recx data */ for (i = 0; i < iod_eph_total; i++) { int j; @@ -2570,9 +2577,8 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io for (i = 0; i < iod_eph_total; i++) { if (akey_punch_ephs[i] != 0) { mrone->mo_akey_punch_ephs[i] = akey_punch_ephs[i]; - D_DEBUG(DB_REBUILD, "punched %d akey "DF_KEY" " - DF_U64"\n", i, DP_KEY(&iods[i].iod_name), - akey_punch_ephs[i]); + D_DEBUG(DB_REBUILD, DF_RB ": punched %d akey " DF_KEY " " DF_U64 "\n", + DP_RB_MPT(tls), i, DP_KEY(&iods[i].iod_name), akey_punch_ephs[i]); } if (iods[i].iod_nr == 0) @@ -2593,8 +2599,10 @@ migrate_one_create(struct enum_unpack_arg *arg, struct dc_obj_enum_unpack_io *io D_GOTO(free, rc); } - D_DEBUG(DB_REBUILD, DF_UOID" %p dkey "DF_KEY" migrate on idx %d iod_num %d min eph "DF_U64 - " ver %u\n", DP_UOID(mrone->mo_oid), mrone, DP_KEY(dkey), iter_arg->tgt_idx, + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " %p dkey " DF_KEY " migrate on idx %d iod_num %d " + "min eph " DF_U64 " ver %u\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, DP_KEY(dkey), iter_arg->tgt_idx, mrone->mo_iod_num, mrone->mo_min_epoch, version); d_list_add(&mrone->mo_list, &arg->merge_list); @@ -2646,8 +2654,7 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) tls = migrate_pool_tls_lookup(arg->arg->pool_uuid, arg->arg->version, arg->arg->generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(arg->arg->pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->arg->pool_uuid)); D_GOTO(put, rc = 0); } @@ -2663,8 +2670,8 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) if (rc == 1 && (is_ec_data_shard_by_tgt_off(unpack_tgt_off, &arg->oc_attr) || (io->ui_oid.id_layout_ver > 0 && io->ui_oid.id_shard != parity_shard))) { - D_DEBUG(DB_REBUILD, 
DF_UOID" ignore shard "DF_KEY"/%u/%d/%u/%d.\n", - DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " ignore shard " DF_KEY "/%u/%d/%u/%d.\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, (int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc); D_GOTO(put, rc = 0); } @@ -2681,9 +2688,11 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) continue; } - D_DEBUG(DB_REBUILD, DF_UOID" unpack "DF_KEY" for shard %u/%u/%u/"DF_X64"/%u\n", - DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, unpack_tgt_off, - migrate_tgt_off, io->ui_dkey_hash, parity_shard); + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " unpack " DF_KEY " for shard " + "%u/%u/%u/" DF_X64 "/%u\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard, + unpack_tgt_off, migrate_tgt_off, io->ui_dkey_hash, parity_shard); /** * Since we do not need split the rebuild into parity rebuild @@ -2693,22 +2702,23 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) rc = obj_recx_ec2_daos(&arg->oc_attr, unpack_tgt_off, &iod->iod_recxs, ephs, &iod->iod_nr, false); if (rc != 0) { - D_ERROR(DF_UOID" ec 2 daos %u failed: "DF_RC"\n", - DP_UOID(io->ui_oid), shard, DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " ec 2 daos %u failed", DP_RB_MPT(tls), + DP_UOID(io->ui_oid), shard); D_GOTO(put, rc); } /* Filter the DAOS recxs to the rebuild data shard */ if (is_ec_data_shard_by_layout_ver(layout_ver, io->ui_dkey_hash, &arg->oc_attr, shard)) { - D_DEBUG(DB_REBUILD, DF_UOID" convert shard %u tgt %d\n", - DP_UOID(io->ui_oid), shard, obj_ec_data_tgt_nr(&arg->oc_attr)); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " convert shard %u tgt %d\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), shard, + obj_ec_data_tgt_nr(&arg->oc_attr)); rc = obj_recx_ec_daos2shard(&arg->oc_attr, migrate_tgt_off, &iod->iod_recxs, ephs, &iod->iod_nr); if (rc) { - D_ERROR(DF_UOID" daos to shard %u failed: "DF_RC"\n", - DP_UOID(io->ui_oid), shard, DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " daos to shard %u failed", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), shard); D_GOTO(put, rc); } } @@ -2720,8 +2730,8 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) if (!create_migrate_one) { struct ds_cont_child *cont = NULL; - D_DEBUG(DB_REBUILD, DF_UOID"/"DF_KEY" does not need rebuild.\n", - DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey)); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID "/" DF_KEY " does not need rebuild.\n", + DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey)); /* Create the vos container when no record need to be rebuilt for this shard, * for the case of reintegrate the container was discarded ahead. 
@@ -2729,7 +2739,8 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) rc = migrate_get_cont_child(tls, arg->arg->cont_uuid, &cont, true); if (cont != NULL) ds_cont_child_put(cont); - D_GOTO(put, rc); + + D_GOTO(put, rc = 0); } /* Check if some IODs from this unpack can be merged to the exist mrone, mostly for EC @@ -2774,14 +2785,12 @@ migrate_obj_punch_one(void *data) tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(arg->pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); D_GOTO(put, rc = 0); } - D_DEBUG(DB_REBUILD, "tls %p "DF_UUID" version %d punch "DF_U64" "DF_UOID"\n", - tls, DP_UUID(tls->mpt_pool_uuid), arg->version, arg->punched_epoch, - DP_UOID(arg->oid)); + D_DEBUG(DB_REBUILD, DF_RB ": tls %p version %d punch " DF_U64 " " DF_UOID "\n", + DP_RB_MPT(tls), tls, arg->version, arg->punched_epoch, DP_UOID(arg->oid)); rc = migrate_get_cont_child(tls, arg->cont_uuid, &cont, true); if (rc != 0 || cont == NULL) @@ -2794,8 +2803,8 @@ migrate_obj_punch_one(void *data) ds_cont_child_put(cont); put: if (rc) - D_ERROR(DF_UOID" migrate punch failed: "DF_RC"\n", - DP_UOID(arg->oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": " DF_UOID " migrate punch failed", DP_RB_MPT(tls), + DP_UOID(arg->oid)); if (tls) { if (tls->mpt_status == 0 && rc != 0) tls->mpt_status = rc; @@ -2816,16 +2825,16 @@ migrate_start_ult(struct enum_unpack_arg *unpack_arg) tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(arg->pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); D_GOTO(put, rc = 0); } d_list_for_each_entry_safe(mrone, tmp, &unpack_arg->merge_list, mo_list) { - D_DEBUG(DB_REBUILD, DF_UOID" %p dkey "DF_KEY" migrate on idx %d" - " iod_num %d\n", DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), arg->tgt_idx, - mrone->mo_iod_num); + D_DEBUG(DB_REBUILD, + DF_RB ": " DF_UOID " %p dkey " DF_KEY " migrate on idx %d" + " iod_num %d\n", + DP_RB_MPT(tls), DP_UOID(mrone->mo_oid), mrone, DP_KEY(&mrone->mo_dkey), + arg->tgt_idx, mrone->mo_iod_num); rc = migrate_tgt_enter(tls); if (rc) @@ -2877,13 +2886,11 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, uint32_t num; int rc = 0; - D_DEBUG(DB_REBUILD, "migrate obj "DF_UOID" for shard %u eph " - DF_X64"-"DF_X64"\n", DP_UOID(arg->oid), arg->shard, epr->epr_lo, - epr->epr_hi); + D_DEBUG(DB_REBUILD, DF_RB ": migrate obj " DF_UOID " shard %u eph " DF_X64 "-" DF_X64 "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, epr->epr_lo, epr->epr_hi); if (tls->mpt_fini) { - D_DEBUG(DB_REBUILD, DF_UUID "migration is aborted.\n", - DP_UUID(tls->mpt_pool_uuid)); + D_DEBUG(DB_REBUILD, DF_RB ": migration is aborted.\n", DP_RB_MPT(tls)); return 0; } @@ -2893,13 +2900,13 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, NULL, tls->mpt_pool->spc_pool->sp_map, &tls->mpt_svc_list, &tls->mpt_pool_hdl); if (rc) { - D_ERROR("dsc_pool_open failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dsc_pool_open failed", DP_RB_MPT(tls)); D_GOTO(out, rc); } rc = migrate_cont_open(tls, arg->cont_uuid, 0, &coh); if (rc) { - D_ERROR("migrate_cont_open failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": migrate_cont_open failed", DP_RB_MPT(tls)); D_GOTO(out, rc); } @@ -2908,7 +2915,7 @@ 
migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, */ rc = dsc_obj_open(coh, arg->oid.id_pub, DAOS_OO_RO, &oh); if (rc) { - D_ERROR("dsc_obj_open failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dsc_obj_open failed", DP_RB_MPT(tls)); D_GOTO(out, rc); } @@ -2923,8 +2930,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, dsc_cont_get_props(coh, &props); rc = dsc_obj_id2oc_attr(arg->oid.id_pub, &props, &unpack_arg.oc_attr); if (rc) { - D_ERROR("Unknown object class: %u\n", - daos_obj_id2class(arg->oid.id_pub)); + DL_ERROR(rc, DF_RB ": unknown object class: %u", DP_RB_MPT(tls), + daos_obj_id2class(arg->oid.id_pub)); D_GOTO(out_obj, rc); } @@ -2981,9 +2988,10 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, &dkey_anchor, &akey_anchor, p_csum); if (rc == -DER_KEY2BIG) { - D_DEBUG(DB_REBUILD, "migrate obj "DF_UOID" got " - "-DER_KEY2BIG, key_len "DF_U64"\n", - DP_UOID(arg->oid), kds[0].kd_key_len); + D_DEBUG(DB_REBUILD, + DF_RB ": migrate obj " DF_UOID " got -DER_KEY2BIG, " + "key_len " DF_U64 "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), kds[0].kd_key_len); /* For EC parity migration, it will enumerate from all data * shards, so buffer needs to time grp_size to make sure * retry buffer will be large enough. @@ -3004,8 +3012,10 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, continue; } else if (rc == -DER_TRUNC && p_csum != NULL && p_csum->iov_len > p_csum->iov_buf_len) { - D_DEBUG(DB_REBUILD, "migrate obj csum buf " - "not large enough. Increase and try again"); + D_DEBUG(DB_REBUILD, + DF_RB ": migrate obj csum buf not large enough. " + "Increase and try again\n", + DP_RB_MPT(tls)); if (p_csum->iov_buf != stack_csum_buf) D_FREE(p_csum->iov_buf); @@ -3021,24 +3031,25 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, daos_anchor_get_flags(&dkey_anchor) & DIOF_TO_LEADER) { if (rc != -DER_INPROGRESS) { enum_flags &= ~DIOF_TO_LEADER; - D_DEBUG(DB_REBUILD, "retry to non leader " - DF_UOID": "DF_RC"\n", - DP_UOID(arg->oid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, + DF_RB ": retry to non leader " DF_UOID ": " DF_RC "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), DP_RC(rc)); } else { /* Keep retry on leader if it is inprogress or shutdown, * since the new dtx leader might still resync the * uncommitted records, or it will choose a new leader * once the pool map is updated. */ - D_DEBUG(DB_REBUILD, "retry leader "DF_UOID"\n", - DP_UOID(arg->oid)); + D_DEBUG(DB_REBUILD, DF_RB ": retry leader " DF_UOID "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid)); } continue; } else if (rc == -DER_UPDATE_AGAIN) { /* -DER_UPDATE_AGAIN means the remote target does not parse EC * aggregation yet, so let's retry. */ - D_DEBUG(DB_REBUILD, DF_UOID "retry with %d \n", DP_UOID(arg->oid), rc); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " retry with %d \n", DP_RB_MPT(tls), + DP_UOID(arg->oid), rc); rc = 0; continue; } else if (rc) { @@ -3047,9 +3058,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, */ if (rc == -DER_TIMEDOUT && tls->mpt_version + 1 >= tls->mpt_pool->spc_map_version) { - D_WARN(DF_UUID" retry "DF_UOID" "DF_RC"\n", - DP_UUID(tls->mpt_pool_uuid), DP_UOID(arg->oid), - DP_RC(rc)); + D_WARN(DF_RB ": retry " DF_UOID " " DF_RC "\n", DP_RB_MPT(tls), + DP_UOID(arg->oid), DP_RC(rc)); rc = 0; continue; } @@ -3065,19 +3075,21 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, * -DER_NONEXIST, see obj_ioc_init(). 
*/ if (rc == -DER_DATA_LOSS || rc == -DER_NONEXIST) { - D_WARN("No replicas for "DF_UOID" %d\n", DP_UOID(arg->oid), rc); + D_WARN(DF_RB ": no replicas for " DF_UOID " %d\n", DP_RB_MPT(tls), + DP_UOID(arg->oid), rc); num = 0; rc = 0; } - D_DEBUG(DB_REBUILD, "Can not rebuild "DF_UOID" "DF_RC" mpt %u spc %u\n", - DP_UOID(arg->oid), DP_RC(rc), tls->mpt_version, tls->mpt_pool->spc_map_version); + D_DEBUG(DB_REBUILD, DF_RB ": cannot rebuild " DF_UOID " " DF_RC " spc %u\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), DP_RC(rc), + tls->mpt_pool->spc_map_version); break; } /* Each object enumeration RPC will at least one OID */ if (num <= minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) { - D_DEBUG(DB_REBUILD, "enumeration buffer %u empty" - DF_UOID"\n", num, DP_UOID(arg->oid)); + D_DEBUG(DB_REBUILD, DF_RB ": enumeration buffer %u empty " DF_UOID "\n", + DP_RB_MPT(tls), num, DP_UOID(arg->oid)); break; } @@ -3086,15 +3098,15 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, rc = dc_obj_enum_unpack(arg->oid, kds, num, &sgl, p_csum, migrate_enum_unpack_cb, &unpack_arg); if (rc) { - D_ERROR("migrate "DF_UOID" failed: %d\n", - DP_UOID(arg->oid), rc); + DL_ERROR(rc, DF_RB ": migrate " DF_UOID " failed", DP_RB_MPT(tls), + DP_UOID(arg->oid)); break; } rc = migrate_start_ult(&unpack_arg); if (rc) { - D_ERROR("start migrate "DF_UOID" failed: "DF_RC"\n", - DP_UOID(arg->oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": start migrate " DF_UOID " failed", DP_RB_MPT(tls), + DP_UOID(arg->oid)); break; } @@ -3113,9 +3125,9 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, out_obj: dsc_obj_close(oh); out: - D_DEBUG(DB_REBUILD, "obj "DF_UOID" for shard %u eph " - DF_U64"-"DF_U64": "DF_RC"\n", DP_UOID(arg->oid), arg->shard, - epr->epr_lo, epr->epr_hi, DP_RC(rc)); + D_DEBUG(DB_REBUILD, + DF_RB ": obj " DF_UOID " shard %u eph " DF_U64 "-" DF_U64 ": " DF_RC "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, epr->epr_lo, epr->epr_hi, DP_RC(rc)); return rc; } @@ -3234,8 +3246,7 @@ migrate_obj_ult(void *data) tls = migrate_pool_tls_lookup(arg->pool_uuid, arg->version, arg->generation); if (tls == NULL || tls->mpt_fini) { - D_WARN("some one abort the rebuild "DF_UUID"\n", - DP_UUID(arg->pool_uuid)); + D_WARN("someone aborted the rebuild " DF_UUID "\n", DP_UUID(arg->pool_uuid)); D_GOTO(free_notls, rc); } @@ -3246,16 +3257,16 @@ migrate_obj_ult(void *data) */ if (tls->mpt_pool->spc_pool->sp_need_discard) { while(!tls->mpt_pool->spc_discard_done) { - D_DEBUG(DB_REBUILD, DF_UUID" wait for discard to finish.\n", - DP_UUID(arg->pool_uuid)); + D_DEBUG(DB_REBUILD, DF_RB ": wait for discard to finish.\n", + DP_RB_MPT(tls)); dss_sleep(2 * 1000); if (tls->mpt_fini) D_GOTO(free_notls, rc); } if (tls->mpt_pool->spc_pool->sp_discard_status) { rc = tls->mpt_pool->spc_pool->sp_discard_status; - D_DEBUG(DB_REBUILD, DF_UUID " discard failure: " DF_RC, - DP_UUID(arg->pool_uuid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB ": discard failure: " DF_RC "\n", DP_RB_MPT(tls), + DP_RC(rc)); D_GOTO(out, rc); } } @@ -3263,8 +3274,8 @@ migrate_obj_ult(void *data) for (i = 0; i < arg->snap_cnt; i++) { epr.epr_lo = i > 0 ?
arg->snaps[i - 1] + 1 : 0; epr.epr_hi = arg->snaps[i]; - D_DEBUG(DB_REBUILD, "rebuild_snap %d "DF_X64"-"DF_X64"\n", - i, epr.epr_lo, epr.epr_hi); + D_DEBUG(DB_REBUILD, DF_RB ": rebuild_snap %d " DF_X64 "-" DF_X64 "\n", + DP_RB_MPT(tls), i, epr.epr_lo, epr.epr_hi); rc = migrate_one_epoch_object(&epr, tls, arg); if (rc) D_GOTO(free, rc); @@ -3283,9 +3294,10 @@ migrate_obj_ult(void *data) rc = migrate_one_epoch_object(&epr, tls, arg); } else { /* The obj has been punched for this range */ - D_DEBUG(DB_REBUILD, "punched obj "DF_UOID" epoch" - " "DF_U64"/"DF_U64"/"DF_U64"\n", DP_UOID(arg->oid), - arg->epoch, arg->punched_epoch, epr.epr_hi); + D_DEBUG(DB_REBUILD, + DF_RB ": punched obj " DF_UOID " epoch " DF_U64 "/" DF_U64 "/" DF_U64 "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), arg->epoch, arg->punched_epoch, + epr.epr_hi); arg->epoch = DAOS_EPOCH_MAX; } free: @@ -3312,11 +3324,11 @@ migrate_obj_ult(void *data) if (tls->mpt_status == 0 && rc < 0) tls->mpt_status = rc; - D_DEBUG(DB_REBUILD, ""DF_UUID"/%u stop migrate obj "DF_UOID - " for shard %u ult %u/%u "DF_U64" : " DF_RC"\n", - DP_UUID(tls->mpt_pool_uuid), tls->mpt_version, - DP_UOID(arg->oid), arg->shard, atomic_load(tls->mpt_tgt_obj_ult_cnt), - atomic_load(tls->mpt_tgt_dkey_ult_cnt), tls->mpt_obj_count, DP_RC(rc)); + D_DEBUG( + DB_REBUILD, + DF_RB ": stop migrate obj " DF_UOID " for shard %u ult %u/%u " DF_U64 " : " DF_RC "\n", + DP_RB_MPT(tls), DP_UOID(arg->oid), arg->shard, atomic_load(tls->mpt_tgt_obj_ult_cnt), + atomic_load(tls->mpt_tgt_dkey_ult_cnt), tls->mpt_obj_count, DP_RC(rc)); free_notls: if (tls != NULL) migrate_tgt_exit(tls, OBJ_ULT); @@ -3384,9 -3396,8 @@ migrate_one_object(daos_unit_oid_t oid, daos_epoch_t eph, daos_epoch_t punched_e d_iov_set(&val_iov, &val, sizeof(struct migrate_obj_val)); rc = obj_tree_insert(toh, cont_arg->cont_uuid, -1, oid, &val_iov); - D_DEBUG(DB_REBUILD, "Insert "DF_UUID"/"DF_UUID"/"DF_UOID": ver %u " - "ult %u/%u "DF_RC"\n", DP_UUID(tls->mpt_pool_uuid), - DP_UUID(cont_arg->cont_uuid), DP_UOID(oid), tls->mpt_version, + D_DEBUG(DB_REBUILD, DF_RB ": insert " DF_UUID "/" DF_UOID ": ult %u/%u " DF_RC "\n", + DP_RB_MPT(tls), DP_UUID(cont_arg->cont_uuid), DP_UOID(oid), atomic_load(&tls->mpt_obj_ult_cnts[tgt_idx]), atomic_load(&tls->mpt_dkey_ult_cnts[tgt_idx]), DP_RC(rc)); @@ -3416,20 +3427,21 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * if (arg->pool_tls->mpt_fini) return 1; - D_DEBUG(DB_REBUILD, "obj migrate "DF_UUID"/"DF_UOID" %"PRIx64 - " eph "DF_U64" start\n", DP_UUID(arg->cont_uuid), DP_UOID(*oid), - ih.cookie, epoch); + D_DEBUG(DB_REBUILD, + DF_RB ": obj migrate " DF_UUID "/" DF_UOID " %" PRIx64 " eph " DF_U64 " start\n", + DP_RB_MPT(arg->pool_tls), DP_UUID(arg->cont_uuid), DP_UOID(*oid), ih.cookie, epoch); rc = migrate_system_enter(arg->pool_tls, tgt_idx, &yielded); if (rc != 0) { - DL_ERROR(rc, DF_UUID" enter migrate failed.", DP_UUID(arg->cont_uuid)); + DL_ERROR(rc, DF_RB ": " DF_UUID " enter migrate failed.", DP_RB_MPT(arg->pool_tls), + DP_UUID(arg->cont_uuid)); return rc; } rc = migrate_one_object(*oid, epoch, punched_epoch, shard, tgt_idx, arg); if (rc != 0) { - D_ERROR("obj "DF_UOID" migration failed: "DF_RC"\n", - DP_UOID(*oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": obj " DF_UOID " migration failed", DP_RB_MPT(arg->pool_tls), + DP_UOID(*oid)); migrate_system_exit(arg->pool_tls, tgt_idx); return rc; } @@ -3440,14 +3452,15 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * rc = dbtree_iter_probe(ih, BTR_PROBE_EQ,
DAOS_INTENT_MIGRATION, &tmp_iov, NULL); if (rc) { D_ASSERT(rc != -DER_NONEXIST); - D_ERROR("obj "DF_UOID" probe failed: "DF_RC"\n", DP_UOID(*oid), DP_RC(rc)); + DL_ERROR(rc, DF_RB ": obj " DF_UOID " probe failed", + DP_RB_MPT(arg->pool_tls), DP_UOID(*oid)); return rc; } } rc = dbtree_iter_delete(ih, NULL); if (rc) { - D_ERROR("dbtree_iter_delete failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dbtree_iter_delete failed", DP_RB_MPT(arg->pool_tls)); return rc; } @@ -3462,7 +3475,7 @@ migrate_obj_iter_cb(daos_handle_t ih, d_iov_t *key_iov, d_iov_t *val_iov, void * if (rc == -DER_NONEXIST) return 1; else if (rc != 0) - D_ERROR("dbtree_iter_probe failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dbtree_iter_probe failed", DP_RB_MPT(arg->pool_tls)); return rc; } @@ -3487,22 +3500,20 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, int rc; uuid_copy(cont_uuid, *(uuid_t *)key_iov->iov_buf); - D_DEBUG(DB_REBUILD, "iter cont "DF_UUID"/%"PRIx64" %"PRIx64" start\n", - DP_UUID(cont_uuid), ih.cookie, root->root_hdl.cookie); + D_DEBUG(DB_REBUILD, DF_RB ": iter cont " DF_UUID "/%" PRIx64 " %" PRIx64 " start\n", + DP_RB_MPT(tls), DP_UUID(cont_uuid), ih.cookie, root->root_hdl.cookie); rc = ds_pool_lookup(tls->mpt_pool_uuid, &dp); if (rc) { - D_ERROR(DF_UUID" ds_pool_lookup failed: "DF_RC"\n", - DP_UUID(tls->mpt_pool_uuid), DP_RC(rc)); - if (rc == -DER_SHUTDOWN) - rc = 0; + DL_ERROR(rc, DF_RB ": ds_pool_lookup failed", DP_RB_MPT(tls)); + rc = 0; D_GOTO(out_put, rc); } rc = ds_cont_fetch_snaps(dp->sp_iv_ns, cont_uuid, &snapshots, &snap_cnt); if (rc) { - D_ERROR("ds_cont_fetch_snaps failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": ds_cont_fetch_snaps failed", DP_RB_MPT(tls)); D_GOTO(out_put, rc); } @@ -3512,8 +3523,8 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, * since EC boundary does not start yet, which is forbidden * during rebuild anyway, so let's continue. 
*/ - D_DEBUG(DB_REBUILD, DF_UUID" fetch agg_boundary failed: "DF_RC"\n", - DP_UUID(cont_uuid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UUID " fetch agg_boundary failed: " DF_RC "\n", + DP_RB_MPT(tls), DP_UUID(cont_uuid), DP_RC(rc)); } arg.yield_freq = DEFAULT_YIELD_FREQ; @@ -3532,7 +3543,7 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, break; } - D_DEBUG(DB_REBUILD, "iter cont "DF_UUID"/%"PRIx64" finish.\n", + D_DEBUG(DB_REBUILD, DF_RB ": iter cont " DF_UUID "/%" PRIx64 " finish.\n", DP_RB_MPT(tls), DP_UUID(cont_uuid), ih.cookie); rc = dbtree_destroy(root->root_hdl, NULL); @@ -3540,7 +3551,7 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, /* Ignore the DRAM migrate object tree for the moment, since * it does not impact the migration on the storage anyway */ - D_ERROR("dbtree_destroy failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dbtree_destroy failed", DP_RB_MPT(tls)); } /* Snapshot fetch will yield the ULT, let's reprobe before delete */ @@ -3554,7 +3565,7 @@ migrate_cont_iter_cb(daos_handle_t ih, d_iov_t *key_iov, rc = dbtree_iter_delete(ih, NULL); if (rc) { - D_ERROR("dbtree_iter_delete failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dbtree_iter_delete failed", DP_RB_MPT(tls)); D_GOTO(free, rc); } @@ -3595,7 +3606,7 @@ migrate_ult(void *arg) DAOS_INTENT_PURGE, false, migrate_cont_iter_cb, pool_tls); if (rc < 0) { - D_ERROR("dbtree iterate failed: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": dbtree iterate failed", DP_RB_MPT(pool_tls)); if (pool_tls->mpt_status == 0) pool_tls->mpt_status = rc; break; @@ -3627,7 +3638,7 @@ migrate_try_create_object_tree(struct migrate_pool_tls *tls) &tls->mpt_root, &tls->mpt_root_hdl); if (rc != 0) { - D_ERROR("failed to create tree: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": failed to create tree", DP_RB_MPT(tls)); return rc; } } @@ -3640,7 +3651,7 @@ migrate_try_create_object_tree(struct migrate_pool_tls *tls) &tls->mpt_migrated_root, &tls->mpt_migrated_root_hdl); if (rc != 0) { - D_ERROR("failed to create tree: "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, DF_RB ": failed to create migrated tree", DP_RB_MPT(tls)); return rc; } } @@ -3671,22 +3682,23 @@ migrate_try_obj_insert(struct migrate_pool_tls *tls, uuid_t co_uuid, val.punched_epoch = punched_epoch; val.shard = shard; val.tgt_idx = tgt_idx; - D_DEBUG(DB_REBUILD, "Insert migrate "DF_UUID"/"DF_UOID" "DF_U64"/"DF_U64 - "/%d/%d\n", DP_UUID(co_uuid), DP_UOID(oid), epoch, punched_epoch, - shard, tgt_idx); + D_DEBUG(DB_REBUILD, + DF_RB ": insert migrate " DF_UUID "/" DF_UOID " " DF_U64 "/" DF_U64 "/%d/%d\n", + DP_RB_MPT(tls), DP_UUID(co_uuid), DP_UOID(oid), epoch, punched_epoch, shard, + tgt_idx); d_iov_set(&val_iov, &val, sizeof(struct migrate_obj_val)); rc = obj_tree_lookup(toh, co_uuid, oid, &val_iov); if (rc != -DER_NONEXIST) { - D_DEBUG(DB_REBUILD, DF_UUID"/"DF_UOID" not need insert: " - DF_RC"\n", DP_UUID(co_uuid), DP_UOID(oid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UUID "/" DF_UOID " no insert needed: " DF_RC "\n", + DP_RB_MPT(tls), DP_UUID(co_uuid), DP_UOID(oid), DP_RC(rc)); return rc; } rc = obj_tree_lookup(migrated_toh, co_uuid, oid, &val_iov); if (rc != -DER_NONEXIST) { - D_DEBUG(DB_REBUILD, DF_UUID"/"DF_UOID" not need insert: " - DF_RC"\n", DP_UUID(co_uuid), DP_UOID(oid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UUID "/" DF_UOID " no insert needed: " DF_RC "\n", + DP_RB_MPT(tls), DP_UUID(co_uuid), DP_UOID(oid), DP_RC(rc)); return rc; } @@ -3725,13 +3737,14 @@ ds_migrate_object(struct ds_pool *pool, uuid_t po_hdl, uuid_t 
co_hdl, uuid_t co_ rc = migrate_try_obj_insert(tls, co_uuid, oids[i], epochs[i], punched_epochs[i], shards[i], tgt_idx); if (rc == -DER_EXIST) { - D_DEBUG(DB_TRACE, DF_UOID"/"DF_UUID"exists.\n", - DP_UOID(oids[i]), DP_UUID(co_uuid)); + D_DEBUG(DB_TRACE, DF_RB ": " DF_UOID "/" DF_UUID "exists.\n", + DP_RB_MPT(tls), DP_UOID(oids[i]), DP_UUID(co_uuid)); rc = 0; continue; } else if (rc < 0) { - D_ERROR(DF_UOID"/"DF_U64"/"DF_UUID"/%u insert failed: %d\n", - DP_UOID(oids[i]), epochs[i], DP_UUID(co_uuid), shards[i], rc); + DL_ERROR(rc, DF_RB ": " DF_UOID "/" DF_U64 "/" DF_UUID "/%u insert failed", + DP_RB_MPT(tls), DP_UOID(oids[i]), epochs[i], DP_UUID(co_uuid), + shards[i]); break; } } @@ -3746,7 +3759,7 @@ ds_migrate_object(struct ds_pool *pool, uuid_t po_hdl, uuid_t co_hdl, uuid_t co_ migrate_pool_tls_get(tls); rc = dss_ult_create(migrate_ult, tls, DSS_XS_SELF, 0, MIGRATE_STACK_SIZE, NULL); if (rc) { - D_ERROR("Create migrate ULT failed: rc %d\n", rc); + DL_ERROR(rc, DF_RB ": create migrate ULT failed", DP_RB_MPT(tls)); tls->mpt_ult_running = 0; migrate_pool_tls_put(tls); } @@ -3791,13 +3804,13 @@ ds_obj_migrate_handler(crt_rpc_t *rpc) if (oids_count == 0 || shards_count == 0 || ephs_count == 0 || oids_count != shards_count || oids_count != ephs_count) { - D_ERROR("oids %u shards %u ephs %d\n", - oids_count, shards_count, ephs_count); + D_ERROR(DF_RB ": oids %u shards %u ephs %d\n", DP_RB_OMI(migrate_in), oids_count, + shards_count, ephs_count); D_GOTO(out, rc = -DER_INVAL); } if (migrate_in->om_tgt_idx >= dss_tgt_nr) { - D_ERROR("Wrong tgt idx %d\n", migrate_in->om_tgt_idx); + D_ERROR(DF_RB " wrong tgt idx %d\n", DP_RB_OMI(migrate_in), migrate_in->om_tgt_idx); D_GOTO(out, rc = -DER_INVAL); } @@ -3809,12 +3822,12 @@ ds_obj_migrate_handler(crt_rpc_t *rpc) rc = ds_pool_lookup(po_uuid, &pool); if (rc != 0) { if (rc == -DER_SHUTDOWN) { - D_DEBUG(DB_REBUILD, DF_UUID" pool service is stopping.\n", - DP_UUID(po_uuid)); + D_DEBUG(DB_REBUILD, DF_RB " pool service is stopping.\n", + DP_RB_OMI(migrate_in)); rc = 0; } else { - D_DEBUG(DB_REBUILD, DF_UUID" pool service is not started yet. "DF_RC"\n", - DP_UUID(po_uuid), DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB " pool service is not started yet. 
" DF_RC "\n", + DP_RB_OMI(migrate_in), DP_RC(rc)); rc = -DER_AGAIN; } D_GOTO(out, rc); @@ -3823,8 +3836,7 @@ ds_obj_migrate_handler(crt_rpc_t *rpc) ds_rebuild_running_query(migrate_in->om_pool_uuid, -1, &rebuild_ver, NULL, NULL); if (rebuild_ver == 0 || rebuild_ver != migrate_in->om_version) { rc = -DER_SHUTDOWN; - DL_ERROR(rc, DF_UUID" rebuild ver %u om version %u", - DP_UUID(migrate_in->om_pool_uuid), rebuild_ver, migrate_in->om_version); + DL_ERROR(rc, DF_RB " rebuild ver %u", DP_RB_OMI(migrate_in), rebuild_ver); D_GOTO(out, rc); } @@ -3872,7 +3884,7 @@ migrate_check_one(void *data) atomic_load(tls->mpt_tgt_dkey_ult_cnt); ABT_mutex_unlock(arg->status_lock); D_DEBUG(DB_REBUILD, - DF_RB " status %d/%d/ ult %u/%u rec/obj/size " DF_U64 "/" DF_U64 "/" DF_U64 "\n", + DF_RB " status %d/%d/ ult %u/%u rec/obj/size " DF_U64 "/" DF_U64 "/" DF_U64 "\n", DP_RB_MQA(arg), tls->mpt_status, arg->dms.dm_status, atomic_load(tls->mpt_tgt_obj_ult_cnt), atomic_load(tls->mpt_tgt_dkey_ult_cnt), tls->mpt_rec_count, tls->mpt_obj_count, tls->mpt_size); @@ -4040,8 +4052,7 @@ ds_object_migrate_send(struct ds_pool *pool, uuid_t pool_hdl_uuid, uuid_t cont_h *max_delay = rpc_timeout; } out: - D_DEBUG(DB_REBUILD, DF_RB ": rc=%d\n", DP_UUID(pool->sp_uuid), version, generation, - RB_OP_STR(migrate_opc), rc); + D_DEBUG(DB_REBUILD, DF_RB ": rc=%d\n", DP_RB_OMI(migrate_in), rc); if (rpc) crt_req_decref(rpc); diff --git a/src/pool/srv_cli.c b/src/pool/srv_cli.c index 5630394b154..bbc228dd2b1 100644 --- a/src/pool/srv_cli.c +++ b/src/pool/srv_cli.c @@ -337,7 +337,7 @@ dsc_pool_svc_call(uuid_t uuid, d_rank_list_t *ranks, struct dsc_pool_svc_call_cb struct pool_query_arg { d_rank_list_t **pqa_enabled_ranks; d_rank_list_t **pqa_disabled_ranks; - d_rank_list_t **pqa_suspect_ranks; + d_rank_list_t **pqa_dead_ranks; daos_pool_info_t *pqa_info; uint32_t *pqa_layout_ver; uint32_t *pqa_upgrade_layout_ver; @@ -369,7 +369,7 @@ pool_query_init(uuid_t pool_uuid, crt_rpc_t *rpc, void *varg) } static int -pool_map_get_suspect_ranks(struct pool_map *map, d_rank_list_t **ranks) +pool_map_get_dead_ranks(struct pool_map *map, d_rank_list_t **ranks) { crt_group_t *primary_grp; struct pool_domain *doms; @@ -418,7 +418,7 @@ pool_map_get_suspect_ranks(struct pool_map *map, d_rank_list_t **ranks) static int process_query_result(d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ranks, - d_rank_list_t **suspect_ranks, daos_pool_info_t *info, uuid_t pool_uuid, + d_rank_list_t **dead_ranks, daos_pool_info_t *info, uuid_t pool_uuid, uint32_t map_version, uint32_t leader_rank, struct daos_pool_space *ps, struct daos_rebuild_status *rs, struct pool_buf *map_buf, uint64_t pi_bits) { @@ -426,7 +426,7 @@ process_query_result(d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ran unsigned int num_disabled = 0; d_rank_list_t *enabled_rank_list = NULL; d_rank_list_t *disabled_rank_list = NULL; - d_rank_list_t *suspect_rank_list = NULL; + d_rank_list_t *dead_rank_list = NULL; int rc; rc = pool_map_create(map_buf, map_version, &map); @@ -474,21 +474,21 @@ process_query_result(d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ran D_DEBUG(DB_MD, DF_UUID ": found %" PRIu32 " disabled ranks in pool map\n", DP_UUID(pool_uuid), disabled_rank_list->rl_nr); } - if ((pi_bits & DPI_ENGINES_SUSPECT) != 0) { - if (suspect_ranks == NULL) { + if ((pi_bits & DPI_ENGINES_DEAD) != 0) { + if (dead_ranks == NULL) { DL_ERROR(-DER_INVAL, - DF_UUID ": query pool requested suspect ranks, but ptr is NULL", + DF_UUID ": query pool requested dead ranks, but ptr is NULL", 
DP_UUID(pool_uuid)); D_GOTO(error, rc = -DER_INVAL); } - rc = pool_map_get_suspect_ranks(map, &suspect_rank_list); + rc = pool_map_get_dead_ranks(map, &dead_rank_list); if (rc != 0) { DL_ERROR(rc, DF_UUID ": pool_map_get_ranks() failed", DP_UUID(pool_uuid)); D_GOTO(error, rc); } - D_DEBUG(DB_MD, DF_UUID ": found %" PRIu32 " suspect ranks in pool map\n", - DP_UUID(pool_uuid), suspect_rank_list->rl_nr); + D_DEBUG(DB_MD, DF_UUID ": found %" PRIu32 " dead ranks in pool map\n", + DP_UUID(pool_uuid), dead_rank_list->rl_nr); } pool_query_reply_to_info(pool_uuid, map_buf, map_version, leader_rank, ps, rs, info); @@ -497,14 +497,14 @@ process_query_result(d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ran *enabled_ranks = enabled_rank_list; if (disabled_rank_list != NULL) *disabled_ranks = disabled_rank_list; - if (suspect_rank_list != NULL) - *suspect_ranks = suspect_rank_list; + if (dead_rank_list != NULL) + *dead_ranks = dead_rank_list; D_GOTO(out, rc = -DER_SUCCESS); error: d_rank_list_free(disabled_rank_list); d_rank_list_free(enabled_rank_list); - d_rank_list_free(suspect_rank_list); + d_rank_list_free(dead_rank_list); out: if (map != NULL) pool_map_decref(map); @@ -534,7 +534,7 @@ pool_query_consume(uuid_t pool_uuid, crt_rpc_t *rpc, void *varg) D_DEBUG(DB_MGMT, DF_UUID": Successfully queried pool\n", DP_UUID(pool_uuid)); rc = process_query_result( - arg->pqa_enabled_ranks, arg->pqa_disabled_ranks, arg->pqa_suspect_ranks, arg->pqa_info, + arg->pqa_enabled_ranks, arg->pqa_disabled_ranks, arg->pqa_dead_ranks, arg->pqa_info, pool_uuid, out->pqo_op.po_map_version, out->pqo_op.po_hint.sh_rank, &out->pqo_space, &out->pqo_rebuild_st, arg->pqa_map_buf, arg->pqa_info->pi_bits); if (arg->pqa_layout_ver) @@ -572,7 +572,7 @@ static struct dsc_pool_svc_call_cbs pool_query_cbs = { * \param[in] deadline Unix time deadline in milliseconds * \param[out] enabled_ranks Optional, storage ranks with enabled targets. * \param[out] disabled_ranks Optional, storage ranks with disabled ranks. - * \param[out] suspect_ranks Optional, suspect ranks marked as DEAD by the SWIM + * \param[out] dead_ranks Optional, storage ranks marked as DEAD by the SWIM * protocol, but were not excluded from the system. * \param[in][out] pool_info Results of the pool query * \param[in][out] pool_layout_ver Results of the current pool global version @@ -588,13 +588,13 @@ static struct dsc_pool_svc_call_cbs pool_query_cbs = { int dsc_pool_svc_query(uuid_t pool_uuid, d_rank_list_t *ps_ranks, uint64_t deadline, d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ranks, - d_rank_list_t **suspect_ranks, daos_pool_info_t *pool_info, + d_rank_list_t **dead_ranks, daos_pool_info_t *pool_info, uint32_t *pool_layout_ver, uint32_t *upgrade_layout_ver) { struct pool_query_arg arg = { .pqa_enabled_ranks = enabled_ranks, .pqa_disabled_ranks = disabled_ranks, - .pqa_suspect_ranks = suspect_ranks, + .pqa_dead_ranks = dead_ranks, .pqa_info = pool_info, .pqa_layout_ver = pool_layout_ver, .pqa_upgrade_layout_ver = upgrade_layout_ver, diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 7fa0f33ea5f..d66ae12c328 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -51,13 +51,25 @@ uint32_t ds_pool_get_vos_df_version(uint32_t pool_global_version) { - if (pool_global_version >= 3) + if (pool_global_version == 4) + return VOS_POOL_DF_2_8; + if (pool_global_version == 3) return VOS_POOL_DF_2_6; else if (pool_global_version == 2) return VOS_POOL_DF_2_4; return 0; } +/** Return the VOS DF version for the default pool global version. 
*/ +uint32_t +ds_pool_get_vos_df_version_default(void) +{ + uint32_t v = ds_pool_get_vos_df_version(DAOS_POOL_GLOBAL_VERSION); + + D_ASSERT(v != 0); + return v; +} + #define DUP_OP_MIN_RDB_SIZE (1 << 30) /* Pool service crt event */ @@ -1023,7 +1035,7 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group, d_iov_set(&psid, (void *)pool_uuid, sizeof(uuid_t)); rc = ds_rsvc_dist_start(DS_RSVC_CLASS_POOL, &psid, pool_uuid, ranks, RDB_NIL_TERM, DS_RSVC_CREATE, true /* bootstrap */, ds_rsvc_get_md_cap(), - 0 /* vos_df_version */); + ds_pool_get_vos_df_version_default()); if (rc != 0) D_GOTO(out_ranks, rc); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index bf64b7aacab..5d19f784d79 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -408,9 +408,18 @@ pool_child_recreate(struct ds_pool_child *child) struct dss_module_info *info = dss_get_module_info(); struct smd_pool_info *pool_info; struct stat lstat; + uint32_t vos_df_version; char *path; int rc; + vos_df_version = ds_pool_get_vos_df_version(child->spc_pool->sp_global_version); + if (vos_df_version == 0) { + rc = -DER_NO_PERM; + DL_ERROR(rc, DF_UUID ": pool global version %u not supported", + DP_UUID(child->spc_uuid), child->spc_pool->sp_global_version); + return rc; + } + rc = ds_mgmt_tgt_file(child->spc_uuid, VOS_FILE, &info->dmi_tgt_id, &path); if (rc != 0) return rc; @@ -450,7 +459,7 @@ pool_child_recreate(struct ds_pool_child *child) rc = vos_pool_create(path, child->spc_uuid, 0 /* scm_sz */, pool_info->spi_blob_sz[SMD_DEV_TYPE_DATA], pool_info->spi_blob_sz[SMD_DEV_TYPE_META], - 0 /* flags */, 0 /* version */, NULL); + 0 /* flags */, vos_df_version, NULL); if (rc) DL_ERROR(rc, DF_UUID": Create VOS pool failed.", DP_UUID(child->spc_uuid)); diff --git a/src/proto/mgmt/pool.proto b/src/proto/mgmt/pool.proto index ce1615d498e..b6a7535cd7e 100644 --- a/src/proto/mgmt/pool.proto +++ b/src/proto/mgmt/pool.proto @@ -241,7 +241,7 @@ message PoolQueryResp { repeated uint32 svc_reps = 19; // service replica ranks uint64 query_mask = 20; // Bitmask of pool query options used uint64 mem_file_bytes = 21; // per-pool accumulated value of memory file sizes - string suspect_ranks = 22; // optional set of suspect ranks + string dead_ranks = 22; // optional set of dead ranks } message PoolProperty { diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 42c40f81430..28a8d6b747f 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -122,6 +122,11 @@ struct rebuild_global_pool_tracker { /** rebuild status for each server */ struct rebuild_server_status *rgt_servers; + double *rgt_servers_last_update; + double rgt_last_warn; + + /** indirect indices for binary search by rank */ + struct rebuild_server_status **rgt_servers_sorted; /** the current version being rebuilt */ uint32_t rgt_rebuild_ver; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 047800d4b1c..964954ad4d2 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -633,8 +633,8 @@ rebuild_object(struct rebuild_tgt_pool_tracker *rpt, uuid_t co_uuid, daos_unit_o if (myrank == target->ta_comp.co_rank && mytarget == target->ta_comp.co_index && (shard == oid.id_shard) && rpt->rt_rebuild_op != RB_OP_UPGRADE) { - D_DEBUG(DB_REBUILD, DF_UOID" %u/%u already on the target shard\n", - DP_UOID(oid), myrank, mytarget); + D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " %u/%u already on the target shard\n", + DP_RB_RPT(rpt), DP_UOID(oid), myrank, mytarget); return 0; } diff --git 
a/src/rebuild/srv.c b/src/rebuild/srv.c index e051d606794..4b8cdc7c029 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -161,22 +161,70 @@ is_rebuild_global_done(struct rebuild_global_pool_tracker *rgt) #define SCAN_DONE 0x1 #define PULL_DONE 0x2 + +static void +servers_sop_swap(void *array, int a, int b) +{ + struct rebuild_server_status **servers = (struct rebuild_server_status **)array; + struct rebuild_server_status *tmp; + + tmp = servers[a]; + servers[a] = servers[b]; + servers[b] = tmp; +} + +static int +servers_sop_cmp(void *array, int a, int b) +{ + struct rebuild_server_status **servers = (struct rebuild_server_status **)array; + + if (servers[a]->rank > servers[b]->rank) + return 1; + if (servers[a]->rank < servers[b]->rank) + return -1; + return 0; +} + +static int +servers_sop_cmp_key(void *array, int i, uint64_t key) +{ + struct rebuild_server_status **servers = (struct rebuild_server_status **)array; + d_rank_t rank = (d_rank_t)key; + + if (servers[i]->rank > rank) + return 1; + if (servers[i]->rank < rank) + return -1; + return 0; +} + +static daos_sort_ops_t servers_sort_ops = { + .so_swap = servers_sop_swap, + .so_cmp = servers_sop_cmp, + .so_cmp_key = servers_sop_cmp_key, +}; + +static struct rebuild_server_status * +rebuild_server_get_status(struct rebuild_global_pool_tracker *rgt, d_rank_t rank) +{ + int idx; + + idx = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank, + &servers_sort_ops); + if (idx < 0) + return NULL; + return rgt->rgt_servers_sorted[idx]; +} + static void rebuild_leader_set_status(struct rebuild_global_pool_tracker *rgt, d_rank_t rank, uint32_t resync_ver, unsigned flags) { - struct rebuild_server_status *status = NULL; - int i; + struct rebuild_server_status *status = NULL; D_ASSERT(rgt->rgt_servers_number > 0); D_ASSERT(rgt->rgt_servers != NULL); - for (i = 0; i < rgt->rgt_servers_number; i++) { - if (rgt->rgt_servers[i].rank == rank) { - status = &rgt->rgt_servers[i]; - break; - } - } - + status = rebuild_server_get_status(rgt, rank); if (status == NULL) { D_INFO("rank %u is not included in this rebuild.\n", rank); return; @@ -189,6 +237,20 @@ rebuild_leader_set_status(struct rebuild_global_pool_tracker *rgt, status->pull_done = 1; } +static void +rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t rank) +{ + int i; + + i = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank, + &servers_sort_ops); + if (i >= 0) { + rgt->rgt_servers_last_update[i] = ABT_get_wtime(); + return; + } + D_INFO("rank %u is not included in this rebuild.\n", rank); +} + static uint32_t rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) { @@ -261,6 +323,8 @@ int rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt, struct rebuild_iv *iv) { + rebuild_leader_set_update_time(rgt, iv->riv_rank); + D_DEBUG(DB_REBUILD, "iv rank %d scan_done %d pull_done %d resync dtx %u\n", iv->riv_rank, iv->riv_scan_done, iv->riv_pull_done, iv->riv_dtx_resyc_version); @@ -637,6 +701,39 @@ enum { RB_BCAST_QUERY, }; +static void +warn_for_slow_engine_updates(struct rebuild_global_pool_tracker *rgt) +{ + int i; + double now = ABT_get_wtime(); + double tw = now - rgt->rgt_last_warn; /* time since last warning logged */ + bool warned = false; + + /* Throttle warnings to not more often than once per 2 minutes */ + if (tw < 120) + return; + + /* Warn for ranks not done and that haven't provided updates in a while (> 30 sec) */ + for (i = 0; i < rgt->rgt_servers_number; i++) { + double 
tu = now - rgt->rgt_servers_last_update[i]; + d_rank_t r = rgt->rgt_servers[i].rank; + + if (rgt->rgt_servers[i].scan_done && rgt->rgt_servers[i].pull_done) + continue; + + if (tu > 30) { + D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. " + "scan_done=%d pull_done=%d\n", + DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done, + rgt->rgt_servers[i].pull_done); + warned = true; + } + } + + if (warned) + rgt->rgt_last_warn = now; +} + /* * Check rebuild status on the leader. Every other target sends * its own rebuild status by IV. @@ -766,6 +863,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, D_PRINT("%s", sbuf); } sleep: + warn_for_slow_engine_updates(rgt); sched_req_sleep(rgt->rgt_ult, RBLD_CHECK_INTV); } @@ -780,6 +878,10 @@ rebuild_global_pool_tracker_destroy(struct rebuild_global_pool_tracker *rgt) d_list_del(&rgt->rgt_list); if (rgt->rgt_servers) D_FREE(rgt->rgt_servers); + if (rgt->rgt_servers_sorted) + D_FREE(rgt->rgt_servers_sorted); + if (rgt->rgt_servers_last_update) + D_FREE(rgt->rgt_servers_last_update); if (rgt->rgt_lock) ABT_mutex_free(&rgt->rgt_lock); @@ -798,6 +900,7 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t struct rebuild_global_pool_tracker *rgt; int rank_nr; struct pool_domain *doms; + double now; int i; int rc = 0; @@ -813,6 +916,24 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t D_ALLOC_ARRAY(rgt->rgt_servers, rank_nr); if (rgt->rgt_servers == NULL) D_GOTO(out, rc = -DER_NOMEM); + D_ALLOC_ARRAY(rgt->rgt_servers_sorted, rank_nr); + if (rgt->rgt_servers_sorted == NULL) + D_GOTO(out, rc = -DER_NOMEM); + D_ALLOC_ARRAY(rgt->rgt_servers_last_update, rank_nr); + if (rgt->rgt_servers_last_update == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + now = ABT_get_wtime(); + rgt->rgt_last_warn = now; + for (i = 0; i < rank_nr; i++) { + rgt->rgt_servers_sorted[i] = &rgt->rgt_servers[i]; + rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; + rgt->rgt_servers_last_update[i] = now; + } + rgt->rgt_servers_number = rank_nr; + + rc = daos_array_sort(rgt->rgt_servers_sorted, rank_nr, true, &servers_sort_ops); + D_ASSERT(rc == 0); rc = ABT_mutex_create(&rgt->rgt_lock); if (rc != ABT_SUCCESS) @@ -822,10 +943,6 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t if (rc != ABT_SUCCESS) D_GOTO(out, rc = dss_abterr2der(rc)); - for (i = 0; i < rank_nr; i++) - rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_number = rank_nr; - uuid_copy(rgt->rgt_pool_uuid, pool->sp_uuid); rgt->rgt_rebuild_ver = ver; rgt->rgt_status.rs_version = ver; @@ -937,10 +1054,10 @@ rebuild_prepare(struct ds_pool *pool, uint32_t rebuild_ver, ret = rebuild_global_pool_tracker_create(pool, rebuild_ver, rebuild_gen, leader_term, reclaim_eph, rebuild_op, rgt); if (ret) { + rc = ret; DL_ERROR(rc, DF_RB " rebuild_global_pool_tracker create failed", DP_UUID(pool->sp_uuid), rebuild_ver, rebuild_gen, RB_OP_STR(rebuild_op)); - rc = ret; } } diff --git a/src/tests/ftest/control/dmg_pool_query_ranks.py b/src/tests/ftest/control/dmg_pool_query_ranks.py index 74afa3b3982..5570e195b17 100644 --- a/src/tests/ftest/control/dmg_pool_query_ranks.py +++ b/src/tests/ftest/control/dmg_pool_query_ranks.py @@ -48,17 +48,17 @@ def test_pool_query_ranks_basic(self): self._verify_ranks([0, 1, 2, 3, 4], data, "enabled_ranks") self._verify_ranks([], data, "disabled_ranks") - self.log_step("Checking pool query with suspect ranks state information") + self.log_step("Checking pool query with dead ranks 
state information") data = self.dmg.pool_query(self.pool.identifier, health_only=True) - self._verify_ranks([], data, "suspect_ranks") + self._verify_ranks([], data, "dead_ranks") def test_pool_query_ranks_mgmt(self): """Test the state of ranks after excluding and reintegrate them. Test Description: Create a pool with 5 engines, first excluded engine marked as "Disabled" - second stopped one as “Suspect,” restarting it, ensuring rebuild completes, - clearing the “Suspect” status, reintegrating the excluded first engine, and + second stopped one as “Dead,” restarting it, ensuring rebuild completes, + clearing the “Dead” status, reintegrating the excluded first engine, and finally verifying that all engines are enabled with the excluded rank now empty. :avocado: tags=all,daily_regression @@ -74,7 +74,7 @@ def test_pool_query_ranks_mgmt(self): all_ranks = enabled_ranks.copy() self.random.shuffle(all_ranks) exclude_rank = all_ranks[0] - suspect_rank = all_ranks[1] + dead_rank = all_ranks[1] self.log_step(f"Excluding pool rank:{exclude_rank} all_ranks={all_ranks}") self.pool.exclude([exclude_rank]) enabled_ranks.remove(exclude_rank) @@ -89,18 +89,18 @@ def test_pool_query_ranks_mgmt(self): self.pool.wait_for_rebuild_to_start() # kill second rank. - self.log_step(f"Stopping rank:{suspect_rank} all_ranks={all_ranks}") - self.server_managers[0].stop_ranks([suspect_rank], self.d_log) + self.log_step(f"Stopping rank:{dead_rank} all_ranks={all_ranks}") + self.server_managers[0].stop_ranks([dead_rank], self.d_log) - self.log_step(f"Waiting for pool rank {suspect_rank} to be suspected") - self.pool.wait_pool_suspect_ranks([suspect_rank], timeout=30) + self.log_step(f"Waiting for pool rank {dead_rank} to be dead") + self.pool.wait_pool_dead_ranks([dead_rank], timeout=30) self._verify_ranks(disabled_ranks, data, "disabled_ranks") - self.log_step(f"Starting rank {suspect_rank}") - self.server_managers[0].start_ranks([suspect_rank], self.d_log) + self.log_step(f"Starting rank {dead_rank}") + self.server_managers[0].start_ranks([dead_rank], self.d_log) - self.log_step("Waiting for pool ranks to no longer be suspected") - self.pool.wait_pool_suspect_ranks([], timeout=30) + self.log_step("Waiting for pool ranks to no longer be dead") + self.pool.wait_pool_dead_ranks([], timeout=30) self.log_step("Waiting for rebuild to complete") self.pool.wait_for_rebuild_to_end() diff --git a/src/tests/ftest/dfuse/container_attrs.py b/src/tests/ftest/dfuse/container_attrs.py new file mode 100644 index 00000000000..cea930e98f9 --- /dev/null +++ b/src/tests/ftest/dfuse/container_attrs.py @@ -0,0 +1,188 @@ +""" + (C) Copyright 2020-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +import os +import re + +from apricot import TestWithServers +from dfuse_utils import get_dfuse, start_dfuse +from run_utils import run_remote + + +class DfuseContainerAttrs(TestWithServers): + """Check if the dfuse attributes of a container are properly managed. + + :avocado: recursive + """ + + def __init__(self, *args, **kwargs): + """Initialize a DfuseContainerAttrs object""" + super().__init__(*args, **kwargs) + self.dfuse_hosts = None + + def setUp(self): + """Set up each test case.""" + # obtain separate logs + self.update_log_file_names() + + # Start the servers and agents + super().setUp() + + # pylint: disable-next=fixme + # FIXME DFuse is mounted and checked on the launcher node until DAOS-7164 will be fixed. 
At + # this time, it is not possible to create a DFuse sub-container and destroy it properly as + # it has to be done on one of the client node holding the root DFuse mount point. + self.dfuse_hosts = self.agent_managers[0].hosts + + def _check_attrs(self, dfuse, attrs=None, namespace=None): + """Check if the DFuse attributes of a container are loaded + + Check in the log file of the dfuse instance if it contains the DFuse attributes of a given + container. It also checks the value of the attributes found. + + Args: + dfuse (Dfuse): DFuse instance to check + attrs (dict, optional): list of attributes to test + Defaults to None + namespace (str, optional): Namespace for TestContainer parameters in the test yaml file. + Defaults to None + """ + if attrs is None: + attrs = {} + + if namespace is not None: + for attr in self.params.get("attrs", namespace).split(","): + key, value = attr.split(':') + if key not in attrs: + attrs[key] = value + + log_file = os.linesep.join(dfuse.get_log_file_data().output[0].stdout) + for name, value in attrs.items(): + match = re.findall( + fr"^.+\ssetting\s+'{name}'\s+is\s+(\d+)\s+seconds$", + log_file, + re.MULTILINE) + self.assertEqual( + len(match), + 1, + f"Unexpected number setting(s) of attribute {name}: want=1, got={len(match)}") + self.assertEqual( + value, + match[0], + f"Unexpected value for attribute {name}: want={value}, got={match[0]}") + + def test_dfuse_container_create_attrs(self): + """Jira ID: DAOS-14698. + + Test Description: + Create a container with DFuse attributes + Mount a DFuse mount point + Check the output of the DFuse log + :avocado: tags=all,daily_regression + :avocado: tags=vm + :avocado: tags=dfuse,container + :avocado: tags=DfuseContainerAttrs,test_dfuse_container_create_attrs + """ + self.log.info("Creating DAOS pool") + pool = self.get_pool() + + self.log_step("Creating DAOS container with Dfuse attributes") + container = self.get_container(pool, namespace="/run/container_01/*") + + self.log_step("Mounting DFuse mount point") + dfuse = get_dfuse(self, self.dfuse_hosts) + dfuse.env["D_LOG_FLUSH"] = "INFO" + start_dfuse(self, dfuse, pool, container) + + self.log_step("Checking DFuse log file") + self._check_attrs(dfuse, namespace="/run/container_01/*") + + self.log_step("Test passed") + + def test_dfuse_subcontainer_create_attrs(self): + """Jira ID: DAOS-14698. + + Test Description: + Create a container + Mount a DFuse mount point + Create a sub-container with DFuse attributes + Check the output of the DFuse log + + :avocado: tags=all,daily_regression + :avocado: tags=vm + :avocado: tags=dfuse,container + :avocado: tags=DfuseContainerAttrs,test_dfuse_subcontainer_create_attrs + """ + self.log.info("Creating DAOS pool") + pool = self.get_pool() + + self.log_step("Creating DAOS container") + container = self.get_container(pool, namespace="/run/container_02/*") + + self.log_step("Mounting DFuse mount point") + dfuse = get_dfuse(self, self.dfuse_hosts) + dfuse.env["D_LOG_FLUSH"] = "INFO" + start_dfuse(self, dfuse, pool, container) + + self.log_step("Creating DAOS subcontainer with DFuse attributes") + sub_dir = os.path.join(dfuse.mount_dir.value, "foo") + self.get_container(pool, namespace="/run/container_03/*", path=sub_dir) + + self.log_step("Checking DFuse log file") + self._check_attrs(dfuse, namespace="/run/container_03/*") + + self.log_step("Test passed") + + def test_dfuse_subcontainer_set_attrs(self): + """Jira ID: DAOS-14698. 
+ + Test Description: + Create a container + Mount a DFuse mount point + Create a sub-container + Set DFuse attributes to the sub-container + Evict the DFuse sub-container + Stat the DFuse sub-container mount point + Check the output of the DFuse log + + :avocado: tags=all,daily_regression + :avocado: tags=vm + :avocado: tags=dfuse,container + :avocado: tags=DfuseContainerAttrs,test_dfuse_subcontainer_set_attrs + """ + self.log.info("Creating DAOS pool") + pool = self.get_pool() + + self.log_step("Creating DAOS container") + container = self.get_container(pool, namespace="/run/container_04/*") + + self.log_step("Mounting DFuse mount point") + dfuse = get_dfuse(self, self.dfuse_hosts) + dfuse.env["D_LOG_FLUSH"] = "INFO" + start_dfuse(self, dfuse, pool, container) + + self.log_step("Creating DAOS sub-container") + sub_dir = os.path.join(dfuse.mount_dir.value, "bar") + sub_container = self.get_container(pool, namespace="/run/container_05/*", path=sub_dir) + + self.log_step("Setting DFuse attributes to the DAOS sub-container") + attrs = { + "dfuse-attr-time": "153", + "dfuse-dentry-time": "407"} + sub_container.daos.container_set_attr(pool.identifier, sub_container.identifier, attrs) + + self.log_step("Evicting the DAOS sub-container") + sub_container.daos.filesystem_evict(sub_dir) + + self.log_step("Running stat on the DFuse sub-container mount point") + result = run_remote(self.log, self.dfuse_hosts, f"stat {sub_dir}") + if not result.passed: + self.fail(f"stat on {sub_dir} can not be run on {result.failed_hosts}") + + self.log_step("Checking DFuse log file") + self._check_attrs(dfuse, attrs=attrs) + + self.log_step("Test passed") diff --git a/src/tests/ftest/dfuse/container_attrs.yaml b/src/tests/ftest/dfuse/container_attrs.yaml new file mode 100644 index 00000000000..d27a44fbf71 --- /dev/null +++ b/src/tests/ftest/dfuse/container_attrs.yaml @@ -0,0 +1,47 @@ +hosts: + test_servers: 1 + # TODO DAOS-14698: DFuse is mounted and checked on the launcher node until DAOS-7164 will be + # fixed. At this time, it is not possible to create a DFuse sub-container and destroy it properly + # as it has to be done on one of the client node holding the root DFuse mount point. + # test_clients: 1 + +timeout: 100 + +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + log_file: daos_server0.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 + +pool: + size: 1G + +container_01: + type: POSIX + attrs: dfuse-attr-time:666,dfuse-dentry-time:999 + control_method: daos + +container_02: + type: POSIX + control_method: daos + +container_03: + type: POSIX + attrs: dfuse-attr-time:42,dfuse-dentry-time:404 + control_method: daos + +container_04: + type: POSIX + control_method: daos + +container_05: + type: POSIX + control_method: daos diff --git a/src/tests/ftest/util/daos_utils.py b/src/tests/ftest/util/daos_utils.py index c916761b25d..10768bf3c8f 100644 --- a/src/tests/ftest/util/daos_utils.py +++ b/src/tests/ftest/util/daos_utils.py @@ -75,7 +75,7 @@ def pool_autotest(self, pool): def container_create(self, pool, sys_name=None, path=None, cont_type=None, oclass=None, dir_oclass=None, file_oclass=None, chunk_size=None, - properties=None, acl_file=None, label=None): + properties=None, acl_file=None, label=None, attrs=None): # pylint: disable=too-many-arguments """Create a container. @@ -96,6 +96,8 @@ def container_create(self, pool, sys_name=None, path=None, cont_type=None, pairs defining the container properties. 
Defaults to None acl_file (str, optional): ACL file. Defaults to None. label (str, optional): Container label. Defaults to None. + attrs (str, optional): String of comma-separated : pairs defining the + container user attributes. Defaults to None. Returns: dict: the daos json command output converted to a python dictionary @@ -110,10 +112,12 @@ def container_create(self, pool, sys_name=None, path=None, cont_type=None, properties += ',rd_lvl:1' else: properties = 'rd_lvl:1' + return self._get_json_result( ("container", "create"), pool=pool, sys_name=sys_name, path=path, type=cont_type, oclass=oclass, dir_oclass=dir_oclass, file_oclass=file_oclass, - chunk_size=chunk_size, properties=properties, acl_file=acl_file, label=label) + chunk_size=chunk_size, properties=properties, acl_file=acl_file, label=label, + attrs=attrs) def container_clone(self, src, dst): """Clone a container to a new container. @@ -961,6 +965,26 @@ def filesystem_copy(self, src, dst, preserve_props=None): return self._get_result( ("filesystem", "copy"), src=src, dst=dst, preserve_props=preserve_props) + def filesystem_evict(self, path): + """Evict local resources of a DFuse mounted path. + + Args: + path (str): The source, formatted as + + Returns: + CmdResult: Object that contains exit status, stdout, and other + information. + + Todo: + As for the container create with path, this command should have a given list of host on + which to apply. This should be done in the context of the ticket DAOS-7164. + + Raises: + CommandFailure: if the daos filesystem copy command fails. + + """ + return self._get_result(("filesystem", "evict"), path=path) + def version(self): """Call daos version. diff --git a/src/tests/ftest/util/daos_utils_base.py b/src/tests/ftest/util/daos_utils_base.py index 00fec9fe60c..a4594d518e0 100644 --- a/src/tests/ftest/util/daos_utils_base.py +++ b/src/tests/ftest/util/daos_utils_base.py @@ -300,6 +300,9 @@ def __init__(self): # --acl-file=PATH # input file containing ACL self.acl_file = FormattedParameter("--acl-file={}", None) + # --attrs=:[,:,...] 
+ # user-defined attributes + self.attrs = FormattedParameter("--attrs={}", None) class CreateSnapSubCommand(CommonContainerSubCommand): """Defines an object for the daos container create-snap command.""" @@ -594,6 +597,8 @@ def get_sub_command_class(self): """Get the daos filesystem sub command object.""" if self.sub_command.value == "copy": self.sub_command_class = self.CopySubCommand() + elif self.sub_command.value == "evict": + self.sub_command_class = self.EvictSubCommand() else: self.sub_command_class = None @@ -623,6 +628,14 @@ def __init__(self): # filename to write and read container properties self.preserve_props = FormattedParameter("--preserve-props={}") + class EvictSubCommand(CommonFilesystemSubCommand): + """Defines an object for the daos filesystem evict command.""" + + def __init__(self): + """Create a daos filesystem evict command object.""" + super().__init__("evict") + self.path = BasicParameter(None, position=1) + class SystemSubCommand(CommandWithSubCommand): """Defines an object for the daos system subcommand.""" diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index 900da63ebf1..a938491a20f 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -391,13 +391,32 @@ def get_stats(self): cmd = f"daos filesystem query --json {self.mount_dir.value}" result = run_remote(self.log, self.hosts, cmd) if not result.passed: - raise CommandFailure(f'"fs query failed on {result.failed_hosts}') + raise CommandFailure(f"fs query failed on {result.failed_hosts}") data = json.loads("\n".join(result.output[0].stdout)) if data["status"] != 0 or data["error"] is not None: raise CommandFailure("fs query returned bad data.") return data["response"] + def get_log_file_data(self): + """Return the content of the log file for each clients + + Returns: + list: lines of the the DFuse log file for each clients + + Raises: + CommandFailure: on failure to get the DFuse log file + + """ + if not self.env.get("D_LOG_FILE"): + raise CommandFailure("get_log_file_data needs a DFuse log files to be defined") + + log_file = self.env["D_LOG_FILE"] + result = run_remote(self.log, self.hosts, f"cat {log_file}") + if not result.passed: + raise CommandFailure(f"Log file {log_file} can not be open on {result.failed_hosts}") + return result + def get_dfuse(test, hosts, namespace=None): """Get a new Dfuse instance. @@ -431,6 +450,7 @@ def start_dfuse(test, dfuse, pool=None, container=None, **params): Args: test (Test): the test instance + dfuse (Dfuse): the dfuse instance to start pool (TestPool, optional): pool to mount. Defaults to None container (TestContainer, optional): container to mount. Defaults to None params (Object, optional): Dfuse command arguments to update diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 932a9f7a306..5f687900fbc 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -494,6 +494,7 @@ def __init__(self, job, subprocess=False, mpi_type="openmpi"): self.tmpdir_base = FormattedParameter("--mca orte_tmpdir_base {}", None) self.args = BasicParameter(None, None) self.mpi_type = mpi_type + self.hostlist = FormattedParameter("-hosts {}", None) def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): """Assign the hosts to use with the command (-f). 
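The new `attrs` argument added to `container_create()` above, the `--attrs` option registered in daos_utils_base.py, and the `attrs:` entries in container_attrs.yaml all carry the same comma-separated key:value encoding that `DfuseContainerAttrs._check_attrs()` later splits apart. A minimal standalone sketch of that round trip follows; the `parse_attrs` helper is illustrative only (not part of this patch), and the sample values are taken from the container_01 entry in the yaml:

# Sketch: encode/decode the attrs string passed through 'daos container create --attrs'
def parse_attrs(attrs):
    """Split a 'key:value[,key:value,...]' string into a dict,
    mirroring the split(',')/split(':') loop in _check_attrs()."""
    result = {}
    for attr in attrs.split(","):
        key, value = attr.split(":")
        result[key] = value
    return result

encoded = "dfuse-attr-time:666,dfuse-dentry-time:999"  # same form as container_01 in the yaml
decoded = parse_attrs(encoded)
assert decoded == {"dfuse-attr-time": "666", "dfuse-dentry-time": "999"}

The dict form is what the test compares against the values dfuse reports in its log file, so keeping the encoding consistent between the yaml, the command line, and the checker is what makes the new tests deterministic.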
diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index 2d1c11e722e..879e142e872 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -25,10 +25,11 @@ from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, - create_racer_cmdline, ddhhmmss_format, get_harassers, - launch_exclude_reintegrate, launch_extend, launch_reboot, - launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, - reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) + create_racer_cmdline, ddhhmmss_format, debug_logging, get_harassers, + get_id, get_job_logs, job_cleanup, launch_exclude_reintegrate, + launch_extend, launch_jobscript, launch_reboot, launch_server_stop_start, + launch_snapshot, launch_vmd_identify_check, reserved_file_copy, + run_event_check, run_metrics_check, run_monitor_check) class SoakTestBase(TestWithServers): @@ -78,7 +79,11 @@ def __init__(self, *args, **kwargs): self.soak_log_dir = None self.soak_dir = None self.enable_scrubber = False + self.job_scheduler = None + self.joblist = None + self.enable_debug_msg = False self.enable_rebuild_logmasks = False + self.down_nodes = None def setUp(self): """Define test setup to be done.""" @@ -97,30 +102,29 @@ def setUp(self): self.sharedsoaktest_dir = self.sharedsoak_dir + "/pass" + str(self.loop) # Initialize dmg cmd self.dmg_command = self.get_dmg_command() - # Fail if slurm partition is not defined - # NOTE: Slurm reservation and partition are created before soak runs. - # CI uses partition=daos_client and no reservation. - # A21 uses partition=normal/default and reservation=daos-test. - # Partition and reservation names are updated in the yaml file. - # It is assumed that if there is no reservation (CI only), then all - # the nodes in the partition will be used for soak. - if not self.host_info.clients.partition.name: - raise SoakTestError( - "<>") - self.srun_params = {"partition": self.host_info.clients.partition.name} - if self.host_info.clients.partition.reservation: - self.srun_params["reservation"] = self.host_info.clients.partition.reservation - # Include test node for log cleanup; remove from client list + self.job_scheduler = self.params.get("job_scheduler", "/run/*", default="slurm") + # soak jobs do not run on the local node local_host_list = include_local_host(None) - self.slurm_exclude_nodes.add(local_host_list) if local_host_list[0] in self.hostlist_clients: self.hostlist_clients.remove((local_host_list[0])) if not self.hostlist_clients: - self.fail( - "There are no valid nodes in this partition to run " - "soak. Check partition {} for valid nodes".format( - self.host_info.clients.partition.name)) + self.fail("There are no valid nodes to run soak") + if self.job_scheduler == "slurm": + # Fail if slurm partition is not defined + # NOTE: Slurm reservation and partition are created before soak runs. + # CI uses partition=daos_client and no reservation. + # A21 uses partition=normal/default and reservation=daos-test. + # Partition and reservation names are updated in the yaml file. + # It is assumed that if there is no reservation (CI only), then all + # the nodes in the partition will be used for soak. 
+ if not self.host_info.clients.partition.name: + raise SoakTestError( + "<>") + self.srun_params = {"partition": self.host_info.clients.partition.name} + if self.host_info.clients.partition.reservation: + self.srun_params["reservation"] = self.host_info.clients.partition.reservation + # Include test node for log cleanup; remove from client list + self.slurm_exclude_nodes.add(local_host_list) def pre_tear_down(self): """Tear down any test-specific steps prior to running tearDown(). @@ -133,7 +137,7 @@ def pre_tear_down(self): self.log.info("<> at %s", time.ctime()) errors = [] # clear out any jobs in squeue; - if self.failed_job_id_list: + if self.failed_job_id_list and self.job_scheduler == "slurm": job_id = " ".join([str(job) for job in self.failed_job_id_list]) self.log.info("<>", job_id) cmd = "scancel --partition {} -u {} {}".format( @@ -144,7 +148,8 @@ def pre_tear_down(self): if self.all_failed_jobs: errors.append("SOAK FAILED: The following jobs failed {} ".format( " ,".join(str(j_id) for j_id in self.all_failed_jobs))) - + # cleanup any remaining jobs + job_cleanup(self.log, self.hostlist_clients) # verify reserved container data if self.resv_cont: final_resv_file = os.path.join(self.test_dir, "final", "resv_file") @@ -284,6 +289,123 @@ def harasser_job_done(self, args): self.harasser_results[args["name"]] = args["status"] self.harasser_args[args["name"]] = args["vars"] + def schedule_jobs(self, node_list): + """Schedule jobs with internal scheduler. + + Args: + node_list (list): list of nodes to use in jobs + """ + debug_logging(self.log, self.enable_debug_msg, "DBG: schedule_jobs ENTERED ") + job_queue = multiprocessing.Queue() + jobid_list = [] + jobs_not_done = [] + # remove any nodes marked as DOWN + node_list.difference_update(self.down_nodes) + path = os.getenv("PATH") + env = f"export PATH={path}" + lib_path = os.getenv("LD_LIBRARY_PATH") + if lib_path: + env = ";".join([env, f"export LD_LIBRARY_PATH={lib_path}"]) + v_env = os.getenv("VIRTUAL_ENV") + if v_env: + env = ";".join([env, f"export VIRTUAL_ENV={v_env}"]) + for job_dict in self.joblist: + jobid_list.append(job_dict["jobid"]) + jobs_not_done.append(job_dict["jobid"]) + self.log.info("Submitting %s jobs at %s", str(len(jobid_list)), time.ctime()) + job_threads = [] + while True: + if time.time() > self.end_time or len(jobs_not_done) == 0: + break + job_results = {} + # verify that there are enough nodes to run remaining jobs + if len(job_threads) == 0: + for job_dict in self.joblist: + job_id = job_dict["jobid"] + if job_id in jobs_not_done: + node_count = job_dict["nodesperjob"] + if len(node_list) < node_count: + # cancel job + self.soak_results.update({job_id: "CANCELLED"}) + self.log.info( + "FINAL STATE: soak job %s completed with : %s at %s", + job_id, + "CANCELLED", + time.ctime()) + jobs_not_done.remove(job_id) + for job_dict in self.joblist: + job_id = job_dict["jobid"] + if job_id in jobid_list: + node_count = job_dict["nodesperjob"] + if len(node_list) >= node_count: + debug_logging( + self.log, self.enable_debug_msg, f"DBG: node_count {node_count}") + debug_logging( + self.log, + self.enable_debug_msg, + f"DBG: node_list initial/queue {node_list}") + job_node_list = node_list[:node_count] + debug_logging( + self.log, + self.enable_debug_msg, + f"DBG: node_list before launch_job {node_list}") + script = job_dict["jobscript"] + timeout = job_dict["jobtimeout"] + log = job_dict["joblog"] + error_log = job_dict["joberrlog"] + method = launch_jobscript + params = (self.log, job_queue, job_id, job_node_list, + 
env, script, log, error_log, timeout, self) + name = f"SOAK JOB {job_id}" + _thread = threading.Thread( + target=method, args=params, name=name, daemon=True) + job_threads.append(_thread) + jobid_list.remove(job_id) + node_list = node_list[node_count:] + debug_logging( + self.log, + self.enable_debug_msg, + f"DBG: node_list after launch_job {node_list}") + + # Start this job + _thread.start() + + # If we don't process any results this time, we'll sleep before checking again + do_sleep = True + + # Keep reference only to threads that are still running + _alive_threads = [] + for job in job_threads: + if job.is_alive(): + _alive_threads.append(job) + continue + # join finished threads to be safe + job.join() + # Don't sleep - starting scheduling immediately + do_sleep = False + job_threads = _alive_threads + + # Process results, if any + while not job_queue.empty(): + job_results = job_queue.get() + # Results to return in queue + node_list.update(job_results["host_list"]) + self.down_nodes.update(job_results["down_nodes"]) + debug_logging(self.log, self.enable_debug_msg, "DBG: Updating soak results") + self.soak_results[job_results["handle"]] = job_results["state"] + job_done_id = job_results["handle"] + jobs_not_done.remove(job_done_id) + debug_logging( + self.log, + self.enable_debug_msg, + f"DBG: node_list returned from queue {node_list}") + + # Sleep to avoid spin lock + if do_sleep: + time.sleep(3) + + debug_logging(self.log, self.enable_debug_msg, "DBG: schedule_jobs EXITED ") + def job_setup(self, jobs, pool): """Create the cmdline needed to launch job. @@ -292,28 +414,27 @@ def job_setup(self, jobs, pool): pool (obj): TestPool obj Returns: - job_cmdlist: list of sbatch scripts that can be launched - by slurm job manager + job_cmdlist: list of dictionary of jobs that can be launched """ - job_cmdlist = [] self.log.info("<> at %s", self.test_name, time.ctime()) for job in jobs: - jobscript = [] + # list of all job scripts + jobscripts = [] + # command is a list of [sbatch_cmds, log_name] to create a single job script commands = [] - nodesperjob = self.params.get( - "nodesperjob", "/run/" + job + "/*", [1]) - taskspernode = self.params.get( - "taskspernode", "/run/" + job + "/*", [1]) + total_nodes = NodeSet(self.hostlist_clients) + if self.down_nodes: + total_nodes.difference_update(self.down_nodes) + nodesperjob = self.params.get("nodesperjob", "/run/" + job + "/*", [1]) + taskspernode = self.params.get("taskspernode", "/run/" + job + "/*", [1]) for npj in list(nodesperjob): # nodesperjob = -1 indicates to use all nodes in client hostlist if npj < 0: - npj = len(self.hostlist_clients) - if len(self.hostlist_clients) / npj < 1: - raise SoakTestError( - "<> at %s", self.test_name, time.ctime()) job_id_list = [] - # before submitting the jobs to the queue, check the job timeout; + # before starting jobs, check the job timeout; if time.time() > self.end_time: self.log.info("<< SOAK test timeout in Job Startup>>") return job_id_list - # job_cmdlist is a list of batch script files - for script in job_cmdlist: - try: - job_id = slurm_utils.run_slurm_script(self.log, str(script)) - except slurm_utils.SlurmFailed as error: - self.log.error(error) - # Force the test to exit with failure - job_id = None - if job_id: - self.log.info( - "<> at %s", - job_id, script, time.ctime()) - slurm_utils.register_for_job_results(job_id, self, max_wait=self.test_timeout) - # keep a list of the job_id's - job_id_list.append(int(job_id)) - else: - # one of the jobs failed to queue; exit on first fail for now. 
- err_msg = f"Slurm failed to submit job for {script}" - job_id_list = [] - raise SoakTestError(f"<>") + if self.job_scheduler == "slurm": + for job_dict in self.joblist: + script = job_dict["jobscript"] + try: + job_id = slurm_utils.run_slurm_script(self.log, str(script)) + except slurm_utils.SlurmFailed as error: + self.log.error(error) + # Force the test to exit with failure + job_id = None + if job_id: + self.log.info( + "<> at %s", job_id, script, time.ctime()) + slurm_utils.register_for_job_results(job_id, self, max_wait=self.test_timeout) + # Update Job_List with the job_id + job_dict["job_id"] = int(job_id) + job_id_list.append(int(job_id)) + else: + # one of the jobs failed to queue; exit on first fail for now. + err_msg = f"Job failed to run for {script}" + job_id_list = [] + raise SoakTestError(f"<>") + else: + for job_dict in self.joblist: + job_dict["jobid"] = get_id() + job_id_list.append(job_dict["jobid"]) + node_list = NodeSet(self.hostlist_clients) + node_list.difference_update(self.down_nodes) + # self.schedule_jobs() + method = self.schedule_jobs + name = "Job Scheduler" + params = (node_list, ) + scheduler = threading.Thread( + target=method, args=params, name=name, daemon=True) + scheduler.start() + return job_id_list def job_completion(self, job_id_list): @@ -385,8 +531,9 @@ def job_completion(self, job_id_list): failed_job_id_list: IDs of each job that failed in slurm """ - self.log.info( - "<> at %s", self.test_name, time.ctime()) + # pylint: disable=too-many-nested-blocks + + self.log.info("<> at %s", self.test_name, time.ctime()) harasser_interval = 0 failed_harasser_msg = None harasser_timer = time.time() @@ -395,21 +542,28 @@ def job_completion(self, job_id_list): since = journalctl_time() # loop time exists after the first pass; no harassers in the first pass if self.harasser_loop_time and self.harassers: - harasser_interval = self.harasser_loop_time / ( - len(self.harassers) + 1) + harasser_interval = self.harasser_loop_time / (len(self.harassers) + 1) # If there is nothing to do; exit if job_id_list: # wait for all the jobs to finish while len(self.soak_results) < len(job_id_list): - # wait for the jobs to complete. - # enter tearDown before hitting the avocado timeout + debug_logging( + self.log, self.enable_debug_msg, f"DBG: SOAK RESULTS 1 {self.soak_results}") + # wait for the jobs to complete unless test_timeout occurred if time.time() > self.end_time: - self.log.info( - "<< SOAK test timeout in Job Completion at %s >>", - time.ctime()) - for job in job_id_list: - if not slurm_utils.cancel_jobs(self.log, self.control, int(job)).passed: - self.fail(f"Error canceling Job {job}") + self.log.info("<< SOAK test timeout in Job Completion at %s >>", time.ctime()) + if self.job_scheduler == "slurm": + for job in job_id_list: + if not slurm_utils.cancel_jobs(self.log, self.control, int(job)).passed: + self.fail(f"Error canceling Job {job}") + else: + # update soak_results to include job id NOT run and set state = CANCELLED + for job in job_id_list: + if job not in self.soak_results: + self.soak_results.update({job: "CANCELLED"}) + self.log.info("FINAL STATE: soak job %s completed with : %s at %s", + job, "CANCELLED", time.ctime()) + break # monitor events every 15 min if datetime.now() > check_time: run_monitor_check(self) @@ -444,27 +598,14 @@ def job_completion(self, job_id_list): if failed_harasser_msg is not None: self.all_failed_harassers.append(failed_harasser_msg) # check for JobStatus = COMPLETED or CANCELLED (i.e. 
TEST TO) + debug_logging( + self.log, self.enable_debug_msg, f"DBG: SOAK RESULTS 2 {self.soak_results}") for job, result in list(self.soak_results.items()): if result in ["COMPLETED", "CANCELLED"]: job_id_list.remove(int(job)) else: - self.log.info( - "<< Job %s failed with status %s>>", job, result) - # gather all the logfiles for this pass and cleanup test nodes - cmd = f"/usr/bin/rsync -avtr --min-size=1B {self.soak_log_dir} {self.outputsoak_dir}/" - cmd2 = f"/usr/bin/rm -rf {self.soak_log_dir}" - if self.enable_remote_logging: - # Limit fan out to reduce burden on filesystem - result = run_remote(self.log, self.hostlist_clients, cmd, timeout=600, fanout=64) - if result.passed: - result = run_remote(self.log, self.hostlist_clients, cmd2, timeout=600) - if not result.passed: - self.log.error("Remote copy failed on %s", str(result.failed_hosts)) - # copy the local files; local host not included in hostlist_client - if not run_local(self.log, cmd, timeout=600).passed: - self.log.info("Local copy failed: %s", cmd) - if not run_local(self.log, cmd2, timeout=600).passed: - self.log.info("Local copy failed: %s", cmd2) + self.log.info("<< Job %s failed with status %s>>", job, result) + get_job_logs(self) self.soak_results = {} return job_id_list @@ -487,7 +628,8 @@ def execute_jobs(self, jobs, pools): SoakTestError """ - job_script_list = [] + jobid_list = [] + self.joblist = [] # Update the remote log directories from new loop/pass sharedsoaktest_dir = self.sharedsoak_dir + "/pass" + str(self.loop) outputsoaktest_dir = self.outputsoak_dir + "/pass" + str(self.loop) @@ -507,18 +649,15 @@ def execute_jobs(self, jobs, pools): else: self.soak_log_dir = sharedsoaktest_dir # create the batch scripts - job_script_list = self.job_setup(jobs, pools) - # randomize job list - random.seed(4) - random.shuffle(job_script_list) + self.job_setup(jobs, pools) # Gather the job_ids - job_id_list = self.job_startup(job_script_list) + jobid_list = self.job_startup() # Initialize the failed_job_list to job_list so that any # unexpected failures will clear the squeue in tearDown - self.failed_job_id_list = job_id_list + self.failed_job_id_list = jobid_list # Wait for jobs to finish and cancel/kill jobs if necessary - self.failed_job_id_list = self.job_completion(job_id_list) + self.failed_job_id_list = self.job_completion(jobid_list) # Log the failing job ID if self.failed_job_id_list: self.log.info( @@ -537,6 +676,7 @@ def run_soak(self, test_param): """ self.soak_results = {} + self.joblist = [] self.pool = [] self.container = [] self.harasser_results = {} @@ -547,6 +687,8 @@ def run_soak(self, test_param): self.soak_errors = [] self.check_errors = [] self.used = [] + self.down_nodes = NodeSet() + self.enable_debug_msg = self.params.get("enable_debug_msg", "/run/*", default=False) self.mpi_module = self.params.get("mpi_module", "/run/*", default="mpi/mpich-x86_64") self.mpi_module_use = self.params.get( "mpi_module_use", "/run/*", default="/usr/share/modulefiles") @@ -559,7 +701,7 @@ def run_soak(self, test_param): resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000) ignore_soak_errors = self.params.get("ignore_soak_errors", test_param + "*", False) self.enable_il = self.params.get("enable_intercept_lib", test_param + "*", False) - self.sudo_cmd = "sudo" if enable_sudo else "" + self.sudo_cmd = "sudo -n" if enable_sudo else "" self.enable_remote_logging = self.params.get( "enable_remote_logging", os.path.join(test_param, "*"), False) self.enable_scrubber = self.params.get( diff --git 
a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index c527e67fea8..7b5cfdb2608 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -5,25 +5,29 @@ """ # pylint: disable=too-many-lines +import getpass import os import random import re +import stat import threading import time -from itertools import product +from itertools import count, product -import slurm_utils from avocado.core.exceptions import TestFail from avocado.utils.distro import detect +from ClusterShell.NodeSet import NodeSet +from command_utils import command_as_user from command_utils_base import EnvironmentVariables from daos_racer_utils import DaosRacerCommand from data_mover_utils import DcpCommand, FsCopy from dfuse_utils import get_dfuse -from dmg_utils import get_storage_query_device_info, get_storage_query_device_uuids +from dmg_utils import (check_system_query_status, get_storage_query_device_info, + get_storage_query_device_uuids) from duns_utils import format_path from exception_utils import CommandFailure from fio_utils import FioCommand -from general_utils import (DaosTestError, check_ping, check_ssh, get_host_data, get_log_file, +from general_utils import (DaosTestError, check_ping, check_ssh, get_journalctl, get_log_file, get_random_bytes, get_random_string, list_to_str, pcmd, run_command, run_pcmd, wait_for_result) from ior_utils import IorCommand @@ -32,10 +36,11 @@ from mdtest_utils import MdtestCommand from oclass_utils import extract_redundancy_factor from pydaos.raw import DaosApiError, DaosSnapshot -from run_utils import run_remote +from run_utils import run_local, run_remote from test_utils_container import add_container H_LOCK = threading.Lock() +id_counter = count(start=1) def ddhhmmss_format(seconds): @@ -56,6 +61,27 @@ def ddhhmmss_format(seconds): "%H:%M:%S", time.gmtime(seconds % 86400))) +def get_id(): + """Increment a counter to generate job ids + + Returns: + int : next counter value + """ + return next(id_counter) + + +def debug_logging(log, enable_debug_msg, log_msg): + """Enable debug messages in log file. + + Args: + log (logger): logger for the messages produced by this method + enable_debug_msg (boolean): If true, the debug message will be written to log + log_msg (str): debug message to write to log + """ + if enable_debug_msg: + log.debug(log_msg) + + def add_pools(self, pool_names, ranks=None): """Create a list of pools that the various tests use for storage. @@ -181,7 +207,7 @@ def run_event_check(self, since, until): hosts = list(set(self.hostlist_servers)) if events: for journalctl_type in ["kernel", "daos_server"]: - for output in get_journalctl(self, hosts, since, until, journalctl_type): + for output in get_journalctl(hosts, since, until, journalctl_type): for event in events: lines = output["data"].splitlines() for line in lines: @@ -195,7 +221,7 @@ def run_event_check(self, since, until): return events_found -def get_journalctl(self, hosts, since, until, journalctl_type, logging=False): +def get_journalctl_logs(self, hosts, since, until, journalctl_type): """Run the journalctl on daos servers. 
Args: @@ -211,18 +237,14 @@ def get_journalctl(self, hosts, since, until, journalctl_type, logging=False): "data": data requested for the group of hosts """ - command = "{} /usr/bin/journalctl --system -t {} --since=\"{}\" --until=\"{}\"".format( - self.sudo_cmd, journalctl_type, since, until) - err = "Error gathering system log events" - results = get_host_data(hosts, command, "journalctl", err) + results = get_journalctl(hosts, since, until, journalctl_type) name = f"journalctl_{journalctl_type}.log" destination = self.outputsoak_dir - if logging: - for result in results: - for host in result["hosts"]: - log_name = name + "-" + str(host) - self.log.info("Logging %s output to %s", command, log_name) - write_logfile(result["data"], log_name, destination) + for result in results: + for host in result["hosts"]: + log_name = name + "-" + str(host) + self.log.info("Logging output to %s", log_name) + write_logfile(result["data"], log_name, destination) return results @@ -232,8 +254,8 @@ def get_daos_server_logs(self): Args: self (obj): soak obj """ - daos_dir = self.outputsoak_dir + "/daos_server_logs" - logs_dir = os.environ.get("DAOS_TEST_LOG_DIR", "/var/tmp/daos_testing/") + daos_dir = os.path.join(self.outputsoak_dir, "daos_server_logs") + logs_dir = os.path.join(self.test_env.log_dir, "*log*") hosts = self.hostlist_servers if not os.path.exists(daos_dir): os.mkdir(daos_dir) @@ -244,6 +266,34 @@ def get_daos_server_logs(self): raise SoakTestError(f"<>") from error +def get_job_logs(self): + """Gather all job logs for the current pass of soak.""" + + # gather all the logfiles for this pass and cleanup client nodes + cmd = f"/usr/bin/rsync -avtr --min-size=1B {self.soak_log_dir} {self.outputsoak_dir}/" + cmd2 = f"/usr/bin/rm -rf {self.soak_log_dir}" + if self.enable_remote_logging: + # Limit fan out to reduce burden on filesystem + result = run_remote(self.log, self.hostlist_clients, cmd, timeout=600, fanout=64) + if result.passed: + result = run_remote(self.log, self.hostlist_clients, cmd2, timeout=600) + if not result.passed: + self.log.error("Remote copy failed on %s", str(result.failed_hosts)) + # copy script files from shared dir + sharedscr_dir = self.sharedsoak_dir + "/pass" + str(self.loop) + cmd3 = f"/usr/bin/rsync -avtr --min-size=1B {sharedscr_dir} {self.outputsoak_dir}/" + cmd4 = f"/usr/bin/rm -rf {sharedscr_dir}" + if not run_local(self.log, cmd3, timeout=600).passed: + self.log.error("Script file copy failed with %s", cmd3) + if not run_local(self.log, cmd4, timeout=600).passed: + self.log.error("Script file copy failed with %s", cmd4) + # copy the local files; local host not included in hostlist_client + if not run_local(self.log, cmd, timeout=600).passed: + self.log.error("Local copy failed: %s", cmd) + if not run_local(self.log, cmd2, timeout=600).passed: + self.log.error("Local copy failed: %s", cmd2) + + def run_monitor_check(self): """Monitor server cpu, memory usage periodically. @@ -340,6 +390,108 @@ def wait_for_pool_rebuild(self, pool, name): return rebuild_status +def job_cleanup(log, hosts): + """Cleanup after job is done. 
+ + Args: + log (logger): logger for the messages produced by this method + hosts (list): list of node to pass to job script + """ + current_user = getpass.getuser() + for job in ["mpirun", "palsd", "dfuse"]: + cmd = [f"/usr/bin/bash -c 'for pid in $(pgrep -u {current_user} {job})", + "do kill -HUP $pid", + "done'"] + run_remote( + log, hosts, ";".join(cmd), verbose=False, timeout=600, task_debug=False, stderr=False) + if job == "dfuse": + cmd2 = [ + "/usr/bin/bash -c 'for dir in $(find /tmp/soak_dfuse_*/)", + "do fusermount3 -uz $dir", + "rm -rf $dir", + "done'"] + run_remote(log, hosts, ";".join(cmd2), verbose=False, timeout=600, task_debug=False, + stderr=False) + + +def launch_jobscript( + log, job_queue, job_id, host_list, env, script, job_log, error_log, timeout, test): + """Launch the job script on remote node. + + Args: + log (logger): logger for the messages produced by this method + job_queue (Queue): job queue to post status of job + job_id (int): unique job identifier + host_list (list): list of node to pass to job script + env (str): environment variables for job script + script (str): full path to job script + job_log (str): job std out + error_log (str): job std error + timeout (int): job timeout + test (TestObj): soak test obj + """ + + debug_logging(log, test.enable_debug_msg, f"DBG: JOB {job_id} ENTERED launch_jobscript") + job_results = [] + node_results = [] + down_nodes = NodeSet() + state = "UNKNOWN" + if time.time() >= test.end_time: + results = {"handle": job_id, "state": "CANCELLED", "host_list": host_list} + debug_logging(log, test.enable_debug_msg, f"DBG: JOB {job_id} EXITED launch_jobscript") + job_queue.put(results) + return + if isinstance(host_list, str): + # assume one host in list + hosts = host_list + rhost = host_list + else: + hosts = ",".join(sorted(host_list)) + rhost = NodeSet(hosts)[0] + job_log1 = job_log.replace("JOBID", str(job_id)) + error_log1 = error_log.replace("JOBID", str(job_id)) + joblog = job_log1.replace("RHOST", str(rhost)) + errorlog = error_log1.replace("RHOST", str(rhost)) + cmd = ";".join([env, f"{script} {hosts} {job_id} {joblog} {errorlog}"]) + job_results = run_remote( + log, rhost, cmd, verbose=False, timeout=timeout * 60, task_debug=False, stderr=False) + if job_results: + if job_results.timeout: + state = "TIMEOUT" + elif job_results.passed: + state = "COMPLETED" + elif not job_results.passed: + state = "FAILED" + else: + state = "UNKNOWN" + # attempt to cleanup any leftover job processes on timeout + job_cleanup(log, hosts) + if time.time() >= test.end_time: + results = {"handle": job_id, "state": "CANCELLED", "host_list": host_list} + debug_logging(log, test.enable_debug_msg, f"DBG: JOB {job_id} EXITED launch_jobscript") + job_queue.put(results) + # give time to update the queue before exiting + time.sleep(0.5) + return + + # check if all nodes are available + cmd = f"ls {test.test_env.log_dir}" + node_results = run_remote(log, NodeSet(hosts), cmd, verbose=False) + if node_results.failed_hosts: + for node in node_results.failed_hosts: + host_list.remove(node) + down_nodes.update(node) + log.info(f"DBG: Node {node} is marked as DOWN in job {job_id}") + + log.info("FINAL STATE: soak job %s completed with : %s at %s", job_id, state, time.ctime()) + results = {"handle": job_id, "state": state, "host_list": host_list, "down_nodes": down_nodes} + debug_logging(log, test.enable_debug_msg, f"DBG: JOB {job_id} EXITED launch_jobscript") + job_queue.put(results) + # give time to update the queue before exiting + time.sleep(0.5) + 
return + + def launch_snapshot(self, pool, name): """Create a basic snapshot of the reserved pool. @@ -490,9 +642,12 @@ def launch_reboot(self, pools, name, results, args): # If all ranks "joined", issue reintegrate for all pool on all ranks and wait for # rebuild to complete # Update multiprocessing queue with results and args + # pylint: disable=too-many-nested-blocks,too-many-branches status = False params = {} ranks = None + reboot_host = None + ranklist = None if name == "REBOOT": reboot_host = self.random.choice(self.hostlist_servers) ranklist = self.server_managers[0].get_host_ranks(reboot_host) @@ -504,16 +659,16 @@ def launch_reboot(self, pools, name, results, args): self.log.info( "<<>>\n", self.loop, name, ranks, time.ctime()) # reboot host in 1 min - result = run_remote(self.log, reboot_host, "sudo shutdown -r +1") + result = run_remote(self.log, reboot_host, command_as_user("shutdown -r +1", "root")) if result.passed: status = True else: - self.log.error(f"<<>>\n", self.loop, name, reboot_host, - time.ctime()) - status = True - self.dmg_command.system_query() - # wait for node to complete rebooting - if not wait_for_result(self.log, check_ping, 60, 5, True, host=reboot_host, - expected_ping=True, cmd_timeout=60, verbose=True): - self.log.error(f"<<>>\n", - self.loop, name, reboot_host, time.ctime()) - cmd_results = run_remote(self.log, reboot_host, "sudo systemctl restart daos_server") - if cmd_results.passed: - self.dmg_command.system_query() - for pool in pools: - self.dmg_command.pool_query(pool.identifier) - # wait server to be started - try: - self.dmg_command.system_start(ranks=ranks) - except CommandFailure as error: - self.log.error("<<>>\n", self.loop, name, + reboot_host, time.ctime()) + status = True + self.dmg_command.system_query() + # wait for node to complete rebooting + if not wait_for_result(self.log, check_ping, 60, 5, True, host=reboot_host, + expected_ping=True, cmd_timeout=60, verbose=True): + self.log.error(f"<<>>\n", + self.loop, name, reboot_host, time.ctime()) + cmd_results = run_remote( + self.log, reboot_host, command_as_user("systemctl restart daos_server", "root")) + if cmd_results.passed: + self.dmg_command.system_query() + for pool in pools: + pool.query() + # wait server to be started try: - pool.reintegrate(rank) - status = True - except TestFail as error: - self.log.error( - f"<<>>", self.loop, name) - status = False + if status: + # Check the servers are in joined state. 
+ all_joined = False + retry = 0 + while not all_joined and retry < 10: + all_joined = check_system_query_status( + self.get_dmg_command().system_query()) + retry += 1 + time.sleep(10) + if not all_joined: + self.log.error("<<>>", self.loop, name) + status = False params = {"name": name, "status": status, @@ -861,16 +1031,19 @@ def start_dfuse(self, pool, container, name=None, job_spec=None): dfuselog = os.path.join( self.soak_log_dir, self.test_name + "_" + name + "_`hostname -s`_" - "" + "${SLURM_JOB_ID}_" + "daos_dfuse.log") - dfuse_env = f"export D_LOG_FILE_APPEND_PID=1;export D_LOG_MASK=ERR;export D_LOG_FILE={dfuselog}" - module_load = f"module use {self.mpi_module_use};module load {self.mpi_module}" + "" + "${JOB_ID}_" + "daos_dfuse.log") + dfuse_env = ";".join( + ["export D_LOG_FILE_APPEND_PID=1", + "export D_LOG_MASK=ERR", + f"export D_LOG_FILE={dfuselog}"]) + module_load = ";".join([f"module use {self.mpi_module_use}", f"module load {self.mpi_module}"]) dfuse_start_cmds = [ - "clush -S -w $SLURM_JOB_NODELIST \"mkdir -p {}\"".format(dfuse.mount_dir.value), - "clush -S -w $SLURM_JOB_NODELIST \"cd {};{};{};{}\"".format( + "clush -S -w $HOSTLIST \"mkdir -p {}\"".format(dfuse.mount_dir.value), + "clush -S -w $HOSTLIST \"cd {};{};{};{}\"".format( dfuse.mount_dir.value, dfuse_env, module_load, str(dfuse)), "sleep 10", - "clush -S -w $SLURM_JOB_NODELIST \"df -h {}\"".format(dfuse.mount_dir.value), + "clush -S -w $HOSTLIST \"df -h {}\"".format(dfuse.mount_dir.value), ] return dfuse, dfuse_start_cmds @@ -892,8 +1065,8 @@ def stop_dfuse(dfuse, vol=False): dfuse.mount_dir.value)]) dfuse_stop_cmds.extend([ - "clush -S -w $SLURM_JOB_NODELIST \"fusermount3 -uz {0}\"".format(dfuse.mount_dir.value), - "clush -S -w $SLURM_JOB_NODELIST \"rm -rf {0}\"".format(dfuse.mount_dir.value)]) + f'clush -S -w $HOSTLIST "fusermount3 -uz {dfuse.mount_dir.value}"', + f'clush -S -w $HOSTLIST "rm -rf {dfuse.mount_dir.value}"']) return dfuse_stop_cmds @@ -989,7 +1162,7 @@ def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob, oclass_list=None, file_dir_oclass[0], nodesperjob * ppn, nodesperjob, ppn) daos_log = os.path.join( self.soak_log_dir, self.test_name + "_" + log_name - + "_`hostname -s`_${SLURM_JOB_ID}_daos.log") + + "_`hostname -s`_${JOB_ID}_daos.log") env = ior_cmd.get_default_env("mpirun", log_file=daos_log) env["D_LOG_FILE_APPEND_PID"] = "1" sbatch_cmds = [f"module use {self.mpi_module_use}", f"module load {self.mpi_module}"] @@ -1009,17 +1182,21 @@ def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob, oclass_list=None, # add envs if api is HDF5-VOL if api == "HDF5-VOL": vol = True + cont_props = container.properties.value + env["HDF5_DAOS_FILE_PROP"] = '"' + cont_props.replace(",", ";") + '"' + env["HDF5_DAOS_OBJ_CLASS"] = file_dir_oclass[0] env["HDF5_VOL_CONNECTOR"] = "daos" env["HDF5_PLUGIN_PATH"] = str(plugin_path) mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.assign_environment(env, True) mpirun_cmd.ppn.update(ppn) + mpirun_cmd.hostlist.update("$HOSTLIST") sbatch_cmds.append(str(mpirun_cmd)) sbatch_cmds.append("status=$?") if api in ["HDF5-VOL", "POSIX", "POSIX-LIBPIL4DFS", "POSIX-LIBIOIL"]: sbatch_cmds.extend(stop_dfuse(dfuse, vol)) commands.append([sbatch_cmds, log_name]) - self.log.info(f"<>:") + self.log.info(f"<>: ") for cmd in sbatch_cmds: self.log.info(cmd) return commands @@ -1059,13 +1236,13 @@ def create_macsio_cmdline(self, job_spec, pool, ppn, nodesperjob): job_spec, api, file_oclass, nodesperjob * ppn, nodesperjob, ppn) daos_log = os.path.join( 
self.soak_log_dir, self.test_name - + "_" + log_name + "_`hostname -s`_${SLURM_JOB_ID}_daos.log") + + "_" + log_name + "_`hostname -s`_${JOB_ID}_daos.log") macsio_log = os.path.join( self.soak_log_dir, self.test_name - + "_" + log_name + "_`hostname -s`_${SLURM_JOB_ID}_macsio-log.log") + + "_" + log_name + "_`hostname -s`_${JOB_ID}_macsio-log.log") macsio_timing_log = os.path.join( self.soak_log_dir, self.test_name - + "_" + log_name + "_`hostname -s`_${SLURM_JOB_ID}_macsio-timing.log") + + "_" + log_name + "_`hostname -s`_${JOB_ID}_macsio-timing.log") macsio.log_file_name.update(macsio_log) macsio.timings_file_name.update(macsio_timing_log) env = macsio.env.copy() @@ -1087,12 +1264,13 @@ def create_macsio_cmdline(self, job_spec, pool, ppn, nodesperjob): mpirun_cmd.working_dir.update(dfuse.mount_dir.value) mpirun_cmd.assign_environment(env, True) mpirun_cmd.ppn.update(ppn) + mpirun_cmd.hostlist.update("$HOSTLIST") sbatch_cmds.append(str(mpirun_cmd)) sbatch_cmds.append("status=$?") if api in ["HDF5-VOL"]: sbatch_cmds.extend(stop_dfuse(dfuse, vol=True)) commands.append([sbatch_cmds, log_name]) - self.log.info("<>:") + self.log.info("<>: ") for cmd in sbatch_cmds: self.log.info(cmd) return commands @@ -1158,7 +1336,7 @@ def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob): ppn) daos_log = os.path.join( self.soak_log_dir, self.test_name + "_" + log_name - + "_`hostname -s`_${SLURM_JOB_ID}_daos.log") + + "_`hostname -s`_${JOB_ID}_daos.log") env = mdtest_cmd.get_default_env("mpirun", log_file=daos_log) env["D_LOG_FILE_APPEND_PID"] = "1" sbatch_cmds = [f"module use {self.mpi_module_use}", f"module load {self.mpi_module}"] @@ -1178,12 +1356,13 @@ def create_mdtest_cmdline(self, job_spec, pool, ppn, nodesperjob): mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.assign_environment(env, True) mpirun_cmd.ppn.update(ppn) + mpirun_cmd.hostlist.update("$HOSTLIST") sbatch_cmds.append(str(mpirun_cmd)) sbatch_cmds.append("status=$?") if api in ["POSIX", "POSIX-LIBPIL4DFS", "POSIX-LIBIOIL"]: sbatch_cmds.extend(stop_dfuse(dfuse)) commands.append([sbatch_cmds, log_name]) - self.log.info(f"<>:") + self.log.info(f"<>: ") for cmd in sbatch_cmds: self.log.info(cmd) return commands @@ -1213,7 +1392,7 @@ def create_racer_cmdline(self, job_spec): racer_log = os.path.join( self.soak_log_dir, self.test_name + "_" + job_spec + "_`hostname -s`_" - "${SLURM_JOB_ID}_" + "racer_log") + "${JOB_ID}_" + "racer_log") daos_racer.env["D_LOG_FILE"] = get_log_file(racer_log) log_name = job_spec cmds = [] @@ -1221,7 +1400,7 @@ def create_racer_cmdline(self, job_spec): cmds.append("status=$?") # add exit code commands.append([cmds, log_name]) - self.log.info("<>:") + self.log.info("<>: ") for cmd in cmds: self.log.info(cmd) return commands @@ -1307,7 +1486,7 @@ def create_fio_cmdline(self, job_spec, pool): cmds.append("cd -") cmds.extend(stop_dfuse(dfuse)) commands.append([cmds, log_name]) - self.log.info("<>:") + self.log.info("<>: ") for cmd in cmds: self.log.info(cmd) return commands @@ -1341,14 +1520,14 @@ def create_app_cmdline(self, job_spec, pool, ppn, nodesperjob): # ${DAOS_TEST_APP_SRC}/suse => apps built with suse and gnu-mpich # pylint: disable-next=wrong-spelling-in-comment,fixme # ${DAOS_TEST_APP_SRC}/suse/intelmpi => apps built with suse and intelmpi - if "suse" in detect().name.lower(): + if "suse" in detect().name.lower() and os.environ.get("DAOS_TEST_MODE") is None: os.environ["DAOS_TEST_APP_DIR"] += os.path.join(os.sep, "suse") - if "mpi/latest" in mpi_module: + if "mpi/latest" in mpi_module 
and os.environ.get("DAOS_TEST_MODE") is None: os.environ["DAOS_TEST_APP_DIR"] += os.path.join(os.sep, "intelmpi") os.environ["I_MPI_OFI_LIBRARY_INTERNAL"] = "0" app_cmd = os.path.expandvars(self.params.get("cmdline", app_params, default=None)) if app_cmd is None: - self.log.info(f"<<{job_spec} command line not specified in yaml; job will not be run>>") + self.log.info(f"<<{job_spec} command line not specified in yaml>>") return commands oclass_list = self.params.get("oclass", app_params) for file_oclass, dir_oclass in oclass_list: @@ -1378,6 +1557,7 @@ def create_app_cmdline(self, job_spec, pool, ppn, nodesperjob): mpirun_cmd.assign_environment(env, True) mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.ppn.update(ppn) + mpirun_cmd.hostlist.update("$HOSTLIST") if api in ["POSIX", "POSIX-LIBIOIL", "POSIX-LIBPIL4DFS"]: mpirun_cmd.working_dir.update(dfuse.mount_dir.value) cmdline = str(mpirun_cmd) @@ -1386,7 +1566,7 @@ def create_app_cmdline(self, job_spec, pool, ppn, nodesperjob): if api in ["POSIX", "POSIX-LIBIOIL", "POSIX-LIBPIL4DFS"]: sbatch_cmds.extend(stop_dfuse(dfuse)) commands.append([sbatch_cmds, log_name]) - self.log.info(f"<<{job_spec.upper()} cmdlines>>:") + self.log.info(f"<<{job_spec.upper()} cmdlines>>: ") for cmd in sbatch_cmds: self.log.info("%s", cmd) if mpi_module != self.mpi_module: @@ -1428,7 +1608,7 @@ def create_dm_cmdline(self, job_spec, pool, ppn, nodesperjob): dcp_cmd.set_params(src=src_file, dst=dst_file) env_vars = { "D_LOG_FILE": os.path.join(self.soak_log_dir, self.test_name + "_" - + log_name + "_`hostname -s`_${SLURM_JOB_ID}_daos.log"), + + log_name + "_`hostname -s`_${JOB_ID}_daos.log"), "D_LOG_FILE_APPEND_PID": "1" } mpirun_cmd = Mpirun(dcp_cmd, mpi_type=self.mpi_module) @@ -1436,6 +1616,7 @@ def create_dm_cmdline(self, job_spec, pool, ppn, nodesperjob): mpirun_cmd.assign_processes(nodesperjob * ppn) mpirun_cmd.assign_environment(EnvironmentVariables(env_vars), True) mpirun_cmd.ppn.update(ppn) + mpirun_cmd.hostlist.update("$HOSTLIST") sbatch_cmds.append(str(mpirun_cmd)) sbatch_cmds.append("status=$?") @@ -1443,7 +1624,7 @@ def create_dm_cmdline(self, job_spec, pool, ppn, nodesperjob): dm_commands = create_ior_cmdline( self, ior_spec, pool, ppn, nodesperjob, [[file_oclass, dir_oclass]], cont_2) sbatch_cmds.extend(dm_commands[0][0]) - self.log.info("<>:") + self.log.info("<>: ") for cmd in sbatch_cmds: self.log.info("%s", cmd) commands.append([sbatch_cmds, log_name]) @@ -1451,52 +1632,115 @@ def create_dm_cmdline(self, job_spec, pool, ppn, nodesperjob): def build_job_script(self, commands, job, nodesperjob, ppn): - """Create a slurm batch script that will execute a list of cmdlines. + """Generate a script that will execute a list of commands. Args: - self (obj): soak obj - commands(list): command lines and cmd specific log_name - job(str): the job name that will be defined in the slurm script + path (str): where to write the script file + name (str): job name + output (str): where to put the output (full path) + nodecount (int): number of compute nodes to execute on + cmds (list): shell commands that are to be executed + uniq (str): a unique string to append to the job and log files + sbatch_params (dict, optional): dictionary containing other less often used parameters to + sbatch, e.g. mem:100. Defaults to None. 
+ + Raises: + SoakTestError: if missing require parameters for the job script Returns: - script_list: list of slurm batch scripts + str: the full path of the script """ - job_timeout = self.params.get("job_timeout", "/run/" + job + "/*", 10) - self.log.info("<> at %s", time.ctime()) + self.log.info("<> at %s", time.ctime()) script_list = [] - # if additional cmds are needed in the batch script + # Additional commands needed in the job script prepend_cmds = ["set +e", "echo Job_Start_Time `date \\+\"%Y-%m-%d %T\"`", "daos pool query {} ".format(self.pool[1].identifier), "daos pool query {} ".format(self.pool[0].identifier)] + append_cmds = ["daos pool query {} ".format(self.pool[1].identifier), "daos pool query {} ".format(self.pool[0].identifier), "echo Job_End_Time `date \\+\"%Y-%m-%d %T\"`"] exit_cmd = ["exit $status"] - # Create the sbatch script for each list of cmdlines + for cmd, log_name in commands: - if isinstance(cmd, str): - cmd = [cmd] - output = os.path.join( - self.soak_log_dir, self.test_name + "_" + log_name + "_%N_" + "%j_") - error = os.path.join(str(output) + "ERROR_") - sbatch = { - "time": str(job_timeout) + ":00", - "exclude": str(self.slurm_exclude_nodes), - "error": str(error), - "export": "ALL", - "exclusive": None, - "ntasks": str(nodesperjob * ppn) - } - # include the cluster specific params - sbatch.update(self.srun_params) unique = get_random_string(5, self.used) - script = slurm_utils.write_slurm_script( - self.soak_log_dir, job, output, nodesperjob, - prepend_cmds + cmd + append_cmds + exit_cmd, unique, sbatch) - script_list.append(script) self.used.append(unique) + if isinstance(cmd, str): + cmd = [cmd] + if self.job_scheduler == "slurm": + job_timeout = self.params.get("job_timeout", "/run/" + job + "/*", 10) + job_log = os.path.join( + self.soak_log_dir, self.test_name + "_" + log_name + "_%N_" + "%j_") + output = job_log + unique + error = job_log + "ERROR_" + unique + sbatch_params = { + "time": str(job_timeout) + ":00", + "exclude": str(self.slurm_exclude_nodes), + "error": str(error), + "export": "ALL", + "exclusive": None, + "ntasks": str(nodesperjob * ppn) + } + # include the cluster specific params + sbatch_params.update(self.srun_params) + else: + job_log = os.path.join( + self.soak_log_dir, self.test_name + "_" + log_name + "_RHOST" + "_JOBID_") + output = job_log + unique + error = job_log + "ERROR_" + unique + + job_cmds = prepend_cmds + cmd + append_cmds + exit_cmd + # Write script file to shared dir + sharedscript_dir = self.sharedsoak_dir + "/pass" + str(self.loop) + scriptfile = sharedscript_dir + '/jobscript' + "_" + str(unique) + ".sh" + with open(scriptfile, 'w') as script_file: + script_file.write("#!/bin/bash\n#\n") + if self.job_scheduler == "slurm": + # write the slurm directives in the job script + script_file.write("#SBATCH --job-name={}\n".format(job)) + script_file.write("#SBATCH --nodes={}\n".format(nodesperjob)) + script_file.write("#SBATCH --distribution=cyclic\n") + script_file.write("#SBATCH --output={}\n".format(output)) + if sbatch_params: + for key, value in list(sbatch_params.items()): + if value is not None: + script_file.write("#SBATCH --{}={}\n".format(key, value)) + else: + script_file.write("#SBATCH --{}\n".format(key)) + script_file.write("\n") + script_file.write("if [ -z \"$VIRTUAL_ENV\" ]; then \n") + script_file.write(" echo \"VIRTUAL_ENV not defined\" \n") + script_file.write("else \n") + script_file.write(" source $VIRTUAL_ENV/bin/activate \n") + script_file.write("fi \n") + 
script_file.write("HOSTLIST=`nodeset -e -S \",\" $SLURM_JOB_NODELIST` \n") + script_file.write("JOB_ID=$SLURM_JOB_ID \n") + script_file.write("echo \"SLURM NODES: $SLURM_JOB_NODELIST \" \n") + script_file.write("echo \"NODE COUNT: $SLURM_JOB_NUM_NODES \" \n") + script_file.write("echo \"JOB ID: $JOB_ID \" \n") + script_file.write("echo \"HOSTLIST: $HOSTLIST \" \n") + script_file.write("\n") + else: + script_file.write("HOSTLIST=$1 \n") + script_file.write("JOB_ID=$2 \n") + script_file.write("JOB_LOG=$3 \n") + script_file.write("JOB_ERROR_LOG=$4 \n") + script_file.write("echo \"JOB NODES: $HOSTLIST \" \n") + script_file.write("echo \"JOB ID: $JOB_ID \" \n") + script_file.write("if [ -z \"$VIRTUAL_ENV\" ]; then \n") + script_file.write(" echo \"VIRTUAL_ENV not defined\" \n") + script_file.write("else \n") + script_file.write(" source $VIRTUAL_ENV/bin/activate \n") + script_file.write("fi \n") + script_file.write("exec 1> $JOB_LOG \n") + script_file.write("exec 2> $JOB_ERROR_LOG \n") + + for cmd in list(job_cmds): + script_file.write(cmd + "\n") + os.chmod(scriptfile, stat.S_IXUSR | stat.S_IRUSR) + script_list.append([scriptfile, output, error]) return script_list diff --git a/src/tests/ftest/util/test_utils_container.py b/src/tests/ftest/util/test_utils_container.py index 98729bba95e..6dc3dd40736 100644 --- a/src/tests/ftest/util/test_utils_container.py +++ b/src/tests/ftest/util/test_utils_container.py @@ -371,6 +371,7 @@ def __init__(self, pool, daos_command, label_generator=None, namespace=CONT_NAME self.daos_timeout = BasicParameter(None) self.label = BasicParameter(None, "TestContainer") self.label_generator = label_generator + self.attrs = BasicParameter(None) self.register_cleanup = BasicParameter(True, True) # call register_cleanup by default @@ -523,7 +524,8 @@ def create(self, con_in=None, query_id=None): "chunk_size": self.chunk_size.value, "properties": self.properties.value, "acl_file": self.acl_file.value, - "label": self.label.value + "label": self.label.value, + "attrs": self.attrs.value } self._log_method("daos.container_create", kwargs) result = self.daos.container_create(**kwargs) diff --git a/src/tests/ftest/util/test_utils_pool.py b/src/tests/ftest/util/test_utils_pool.py index f5e88d2c26c..e75510ec8b5 100644 --- a/src/tests/ftest/util/test_utils_pool.py +++ b/src/tests/ftest/util/test_utils_pool.py @@ -1448,11 +1448,11 @@ def check_pool_files(self, hosts, uuid, scm_mount): status = False return status - def wait_pool_suspect_ranks(self, expected, interval=1, timeout=30): - """Wait for the pool suspect ranks. + def wait_pool_dead_ranks(self, expected, interval=1, timeout=30): + """Wait for the pool dead ranks. Args: - expected (list): suspect ranks check to wait. + expected (list): dead ranks check to wait. interval (int, optional): number of seconds to wait in between pool query checks timeout(int, optional): time to fail test if it could not match expected values. @@ -1461,19 +1461,19 @@ def wait_pool_suspect_ranks(self, expected, interval=1, timeout=30): DaosTestError: if waiting for timeout. 
""" - self.log.info("waiting for pool ranks %s to be suspected", expected) + self.log.info("waiting for pool ranks %s to be marked dead", expected) start = time() data = self.dmg.pool_query(self.identifier, health_only=True) - while data['response'].get('suspect_ranks') != expected: - self.log.info(" suspect ranks is %s ...", data['response'].get('suspect_ranks')) + while data['response'].get('dead_ranks') != expected: + self.log.info(" dead ranks is %s ...", data['response'].get('dead_ranks')) if time() - start > timeout: raise DaosTestError("TIMEOUT detected after {} seconds while for waiting " - "for ranks {} suspect".format(timeout, expected)) + "for ranks {} dead".format(timeout, expected)) sleep(interval) data = self.dmg.pool_query(self.identifier, health_only=True) - self.log.info("Wait for suspect ranks complete: suspect ranks %s", expected) + self.log.info("Wait for dead ranks complete: dead ranks %s", expected) def verify_uuid_directory(self, host, scm_mount): """Check if pool folder exist on server. diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 85ade4754de..12b4a9958dc 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -1945,8 +1945,6 @@ cr_pause(void **state, bool force) int rc; int i; - FAULT_INJECTION_REQUIRED(); - rc = cr_pool_create(state, &pool, false, class); assert_rc_equal(rc, 0); @@ -3675,6 +3673,8 @@ cr_maintenance_mode(void **state) struct daos_check_info dci = { 0 }; int rc; + FAULT_INJECTION_REQUIRED(); + print_message("CR28: maintenance mode after dry-run check\n"); rc = cr_pool_create(state, &pool, true, TCC_NONE); diff --git a/src/tests/suite/dfs_sys_unit_test.c b/src/tests/suite/dfs_sys_unit_test.c index 47ba8ee20ed..42e3d3f8303 100644 --- a/src/tests/suite/dfs_sys_unit_test.c +++ b/src/tests/suite/dfs_sys_unit_test.c @@ -170,6 +170,12 @@ dfs_sys_test_create_remove(void **state) rc = dfs_sys_symlink(dfs_sys_mt, sym1_target, sym1); assert_int_equal(rc, 0); + /** remove should return ENOTSUP with force since caching is enabled */ + rc = dfs_sys_remove(dfs_sys_mt, dir1, true, NULL); + assert_int_equal(rc, ENOTSUP); + rc = dfs_sys_remove_type(dfs_sys_mt, dir1, true, S_IFDIR, NULL); + assert_int_equal(rc, ENOTSUP); + /** Remove dirs, links with remove */ rc = dfs_sys_remove(dfs_sys_mt, sym1, 0, 0); assert_int_equal(rc, 0); @@ -212,20 +218,18 @@ dfs_sys_test_create_remove(void **state) rc = dfs_sys_close(obj); assert_int_equal(rc, 0); - /** Remove files with remove */ - rc = dfs_sys_remove(dfs_sys_mt, file2, 0, 0); - assert_int_equal(rc, 0); - /** Remove dirs, files, links with remove_type */ rc = dfs_sys_remove_type(dfs_sys_mt, file1, false, S_IFREG, NULL); assert_int_equal(rc, 0); + rc = dfs_sys_remove_type(dfs_sys_mt, file2, false, S_IFREG, NULL); + assert_int_equal(rc, 0); rc = dfs_sys_remove_type(dfs_sys_mt, sym1, false, S_IFLNK, NULL); assert_int_equal(rc, 0); rc = dfs_sys_remove_type(dfs_sys_mt, dir3, false, S_IFDIR, NULL); assert_int_equal(rc, 0); - - /** Remove dirs with remove_type(force) */ - rc = dfs_sys_remove_type(dfs_sys_mt, dir1, true, S_IFDIR, NULL); + rc = dfs_sys_remove_type(dfs_sys_mt, dir2, false, S_IFDIR, NULL); + assert_int_equal(rc, 0); + rc = dfs_sys_remove_type(dfs_sys_mt, dir1, false, S_IFDIR, NULL); assert_int_equal(rc, 0); /** Create dirs, files with mknod */ @@ -242,8 +246,14 @@ dfs_sys_test_create_remove(void **state) 0, 0); assert_int_equal(rc, 0); - /** Remove tree (dir) with remove(force) */ - rc = dfs_sys_remove(dfs_sys_mt, dir1, true, NULL); + /** Remove tree with remove */ + rc = 
dfs_sys_remove(dfs_sys_mt, file1, false, NULL); + assert_int_equal(rc, 0); + rc = dfs_sys_remove(dfs_sys_mt, dir3, false, NULL); + assert_int_equal(rc, 0); + rc = dfs_sys_remove(dfs_sys_mt, dir2, false, NULL); + assert_int_equal(rc, 0); + rc = dfs_sys_remove(dfs_sys_mt, dir1, false, NULL); assert_int_equal(rc, 0); } @@ -640,7 +650,14 @@ dfs_sys_test_open_readdir(void **state) rc = dfs_sys_closedir(dirp); assert_int_equal(rc, 0); - rc = dfs_sys_remove(dfs_sys_mt, dir1, true, NULL); + for (i = 0; i < num_dirs; i++) { + rc = snprintf(buf, buf_size, "%s/sub%u", dir1, i); + assert_true(rc > 0); + rc = dfs_sys_remove(dfs_sys_mt, buf, false, 0); + assert_int_equal(rc, 0); + } + rc = dfs_sys_remove(dfs_sys_mt, dir1, false, NULL); + assert_int_equal(rc, 0); } /** @@ -848,11 +865,11 @@ dfs_sys_test_mkdir(void **state) rc = dfs_sys_mkdir(dfs_sys_mt, file, S_IWUSR | S_IRUSR, 0); assert_int_equal(rc, EEXIST); - rc = dfs_sys_remove(dfs_sys_mt, file, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, file, false, NULL); assert_int_equal(rc, 0); - rc = dfs_sys_remove(dfs_sys_mt, child, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, child, false, NULL); assert_int_equal(rc, 0); - rc = dfs_sys_remove(dfs_sys_mt, parent, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, parent, false, NULL); assert_int_equal(rc, 0); } @@ -885,11 +902,11 @@ dfs_sys_test_mkdir_p(void **state) rc = dfs_sys_mkdir_p(dfs_sys_mt, file, S_IWUSR | S_IRUSR, 0); assert_int_equal(rc, EEXIST); - rc = dfs_sys_remove(dfs_sys_mt, file, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, file, false, NULL); assert_int_equal(rc, 0); - rc = dfs_sys_remove(dfs_sys_mt, child, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, child, false, NULL); assert_int_equal(rc, 0); - rc = dfs_sys_remove(dfs_sys_mt, parent, true, NULL); + rc = dfs_sys_remove(dfs_sys_mt, parent, false, NULL); assert_int_equal(rc, 0); } diff --git a/src/vos/vos_iterator.c b/src/vos/vos_iterator.c index 3312721829b..30baae492e5 100644 --- a/src/vos/vos_iterator.c +++ b/src/vos/vos_iterator.c @@ -589,6 +589,10 @@ static inline void reset_anchors(vos_iter_type_t type, struct vos_iter_anchors *anchors) { switch (type) { + case VOS_ITER_OBJ: + daos_anchor_set_zero(&anchors->ia_obj); + anchors->ia_reprobe_obj = 0; + /* fall through */ case VOS_ITER_DKEY: daos_anchor_set_zero(&anchors->ia_dkey); anchors->ia_reprobe_dkey = 0; @@ -1111,6 +1115,7 @@ vos_iterate_obj(vos_iter_param_t *param, bool recursive, struct vos_iter_anchors DL_ERROR(rc, "Iterate bucket:%u failed.", i); break; } + reset_anchors(VOS_ITER_OBJ, anchors); } D_DEBUG(DB_TRACE, "Iterate %u/%u buckets.\n", iter_cnt, bkt_iter->bi_bkt_tot); diff --git a/utils/ansible/ftest/templates/daos-launch.sh.j2 b/utils/ansible/ftest/templates/daos-launch.sh.j2 index cdf7fe6704b..d9e6bf2f9c3 100644 --- a/utils/ansible/ftest/templates/daos-launch.sh.j2 +++ b/utils/ansible/ftest/templates/daos-launch.sh.j2 @@ -175,7 +175,7 @@ function cleanup mountpoint="/mnt/daos$index" if run ssh $host sudo mountpoint -q "$mountpoint" ; then info "Cleaning mount points $mountpoint of host $host" - run ssh $host sudo m -fr $mountpoint/* + run ssh $host sudo rm -fr $mountpoint/* run ssh $host sudo umount $mountpoint fi done diff --git a/utils/cq/check_update_copyright.sh b/utils/cq/check_update_copyright.sh new file mode 100755 index 00000000000..5bb4016915f --- /dev/null +++ b/utils/cq/check_update_copyright.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# +# Copyright 2024 Intel Corporation. 
+#
+# SPDX-License-Identifier: BSD-2-Clause-Patent
+#
+# Check or update copyright date in modified files.
+# Usage: check_update_copyright.sh <git_target> <githook|gha>
+# mode "githook" will update copyright dates in place.
+# mode "gha" will just print a warning in a GHA-compatible format.
+
+set -e
+
+git_target="$1"
+mode="$2"
+case "$mode" in
+    "githook" | "gha")
+        ;;
+    *)
+        echo "Usage: check_update_copyright.sh <git_target> <githook|gha>"
+        exit 1
+esac
+
+# Navigate to repo root
+PARENT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd "$PARENT_DIR"/../../
+
+
+regex='(^[[:blank:]]*[\*/]*.*)((Copyright[[:blank:]]*)([0-9]{4})(-([0-9]{4}))?)([[:blank:]]*(Intel.*$))'
+year=$(date +%Y)
+errors=0
+targets=(
+    # Entries with wildcard. These must be first and start with '*' or
+    # older versions of git will return files that were not changed.
+    '*.c'
+    '*.h'
+    '*.go'
+    '*.py'
+    '*.proto'
+    '*.java'
+    '*.yml'
+    '*.yaml'
+    '*.sh'
+    '*.bash'
+    '*Dockerfile*'
+    '*README*'
+    '*LICENSE*'
+    '*NOTICE*'
+    '*.txt'
+    '*.md'
+    # Entries without a wildcard
+    'Makefile'
+    'Jenkinsfile'
+    'SConscript'
+    'SConstruct'
+    'copyright'
+    '.env'
+)
+
+if [ -z "$files" ]; then
+    files=$(git diff "$git_target" --cached --diff-filter=AM --name-only -- "${targets[@]}")
+else
+    echo " Checking against custom files"
+fi
+
+os=$(uname -s)
+
+. utils/githooks/git-version.sh
+
+for file in $files; do
+    if [[ "$file" == *vendor* ]] || [[ "$file" == *pb.go ]] ||
+       [[ "$file" == *_string.go ]] || [[ "$file" == *pb-c* ]] ||
+       { [ "$mode" == "githook" ] &&
+         [ "$git_vercode" -ge 2030000 ] &&
+         [ "$(git diff --cached -I Copyright "$file")" = '' ]; }; then
+        continue
+    fi
+    read -r y1 y2 <<< "$(sed -nre "s/^.*$regex.*$/\4 \6/p" "$file")"
+    if [[ -z $y1 ]] ; then
+        # Print warning but don't error on non-existent copyright
+        echo " Copyright Information not found in: $file"
+    elif [[ $y1 -ne $year && $year -ne $y2 ]] ; then
+        if [[ "$mode" == "githook" ]]; then
+            # Update copyright in place
+            if ! git reset "$file"; then
+                echo " Unable to un-stage $file"
+                errors=$((errors + 1))
+            fi
+            if [[ "$os" == 'Linux' ]]; then
+                sed -i -re "s/$regex/\1Copyright $y1-$year \8/" "$file"
+            else
+                sed -i '' -re "s/$regex/\1Copyright $y1-$year \8/" "$file"
+            fi
+
+            if ! git add "$file"; then
+                echo " Unable to re-stage $file"
+                errors=$((errors + 1))
+            fi
+        elif [[ "$mode" == "gha" ]]; then
+            # Print error but do not update
+            lineno="$(grep -nE "$regex" "$file" | cut -f1 -d:)"
+            echo "::error file=$file,line=$lineno::Copyright out of date"
+            errors=$((errors + 1))
+        fi
+    fi
+done
+
+if [[ $errors -ne 0 ]]; then
+    echo " $errors errors while checking/fixing copyrights."
+    exit 1
+fi
diff --git a/utils/githooks/pre-commit.d/10-update-copyright b/utils/githooks/pre-commit.d/10-update-copyright
deleted file mode 100755
index e2641848cd7..00000000000
--- a/utils/githooks/pre-commit.d/10-update-copyright
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2022-2024 Intel Corporation.
-#
-# SPDX-License-Identifier: BSD-2-Clause-Patent
-#
-# A git hook to validate and correct the copyright date in source files.
-
-_print_githook_header "Copyright"
-if [ -e .git/MERGE_HEAD ]; then
-    echo "Merge commit. Skipping"
-    exit 0
-fi
-
-echo "Updating copyright headers"
-
-regex='(^[[:blank:]]*[\*/]*.*)((Copyright[[:blank:]]*)([0-9]{4})(-([0-9]{4}))?)([[:blank:]]*(Intel.*$))'
-year=$(date +%Y)
-errors=0
-targets=(
-    # Entries with wildcard. These must be first and start with '*' or
-    # older versions of git will return files that were not changed.
- '*.c' - '*.h' - '*.go' - '*.py' - '*.proto' - '*.java' - '*.yml' - '*.yaml' - '*.sh' - '*.bash' - '*Dockerfile*' - '*README*' - '*LICENSE*' - '*NOTICE*' - '*.txt' - '*.md' - # Entries without a wildcard - 'Makefile' - 'Jenkinsfile' - 'SConscript' - 'SConstruct' - 'copyright' - '.env' -) - -if [ -z "$files" ]; then - files=$(git diff "$TARGET" --cached --diff-filter=AM --name-only -- "${targets[@]}") -else - echo " Checking against custom files" -fi - -os=$(uname -s) - -. utils/githooks/git-version.sh - -for file in $files; do - if [[ "$file" == *vendor* ]] || [[ "$file" == *pb.go ]] || - [[ "$file" == *_string.go ]] || [[ "$file" == *pb-c* ]] || - { [ "$git_vercode" -ge 2030000 ] && - [ "$(git diff --cached -I Copyright "$file")" = '' ]; }; then - continue - fi - read -r y1 y2 <<< "$(sed -nre "s/^.*$regex.*$/\4 \6/p" "$file")" - if [[ -z $y1 ]] ; then - echo " Copyright Information not found in: $file" - errors=$((errors + 1)) - elif [[ $y1 -ne $year && $year -ne $y2 ]] ; then - git reset "$file" || (echo " Unable to un-stage $file" && exit 1) - if [ "$os" == 'Linux' ] - then - sed -i -re "s/$regex/\1Copyright $y1-$year \8/" "$file" - else - sed -i '' -re "s/$regex/\1Copyright $y1-$year \8/" "$file" - fi - - git add "$file" || (echo " Unable to re-stage $file" && exit 1) - fi -done -[[ $errors = 0 ]] || (echo " $errors errors while checking/fixing copyrights.") diff --git a/utils/githooks/pre-commit.d/10-update-copyright.sh b/utils/githooks/pre-commit.d/10-update-copyright.sh new file mode 100755 index 00000000000..b88cce8e634 --- /dev/null +++ b/utils/githooks/pre-commit.d/10-update-copyright.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Copyright 2022-2024 Intel Corporation. +# +# SPDX-License-Identifier: BSD-2-Clause-Patent +# +# A git hook to validate and correct the copyright date in source files. + +_print_githook_header "Copyright" +if [ -e .git/MERGE_HEAD ]; then + echo "Merge commit. Skipping" + exit 0 +fi + +echo "Updating copyright headers" + +utils/cq/check_update_copyright.sh "$TARGET" githook diff --git a/utils/githooks/pre-commit.d/30-Jenkinsfile b/utils/githooks/pre-commit.d/30-Jenkinsfile.sh similarity index 83% rename from utils/githooks/pre-commit.d/30-Jenkinsfile rename to utils/githooks/pre-commit.d/30-Jenkinsfile.sh index dd482c744e0..c69f08ffac5 100755 --- a/utils/githooks/pre-commit.d/30-Jenkinsfile +++ b/utils/githooks/pre-commit.d/30-Jenkinsfile.sh @@ -23,9 +23,9 @@ echo "Checking syntax" HOST="${HOST:-build.hpdd.intel.com}" CURL_VERBOSE=${CURL_VERBOSE:-""} CURL_PROXY="${CURL_PROXY:+-x }${CURL_PROXY:-}" -CURL_OPTS="$CURL_PROXY $CURL_VERBOSE -s" +CURL_OPTS=("$CURL_PROXY" "$CURL_VERBOSE" -s) URL="https://$HOST/pipeline-model-converter/validate" -if ! output=$(curl $CURL_OPTS -s -X POST -F "jenkinsfile=<${1:-Jenkinsfile}" "$URL"); then +if ! output=$(curl "${CURL_OPTS[@]}" -s -X POST -F "jenkinsfile=<${1:-Jenkinsfile}" "$URL"); then echo " Failed to access $URL. Skipping" exit 0 fi diff --git a/utils/githooks/pre-commit.d/50-clang-format b/utils/githooks/pre-commit.d/50-clang-format.sh similarity index 100% rename from utils/githooks/pre-commit.d/50-clang-format rename to utils/githooks/pre-commit.d/50-clang-format.sh diff --git a/utils/githooks/pre-commit.d/71-flake.sh b/utils/githooks/pre-commit.d/71-flake.sh index a41f28af5c2..082e0f863d8 100755 --- a/utils/githooks/pre-commit.d/71-flake.sh +++ b/utils/githooks/pre-commit.d/71-flake.sh @@ -48,12 +48,12 @@ else rc=0 # non-scons - if ! echo "$py_files" | grep -vi scons | xargs flake8 --config .flake8; then + if ! 
echo "$py_files" | grep -vi scons | xargs -r flake8 --config .flake8; then rc=1 fi # scons - if ! echo "$py_files" | grep -i scons | xargs flake8 --config .flake8-scons; then + if ! echo "$py_files" | grep -i scons | xargs -r flake8 --config .flake8-scons; then rc=1; fi diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 9f51c09b3be..4d58999e5c1 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1757,7 +1757,7 @@ def run_daos_cmd(conf, # pylint: disable-next=too-many-arguments def create_cont(conf, pool=None, ctype=None, label=None, path=None, oclass=None, dir_oclass=None, - file_oclass=None, hints=None, valgrind=False, log_check=True, cwd=None): + file_oclass=None, hints=None, valgrind=False, log_check=True, cwd=None, attrs=None): """Use 'daos' command to create a new container. Args: @@ -1774,6 +1774,7 @@ def create_cont(conf, pool=None, ctype=None, label=None, path=None, oclass=None, valgrind (bool, optional): Whether to run command under valgrind. Defaults to True. log_check (bool, optional): Whether to run log analysis to check for leaks. cwd (str, optional): Path to run daos command from. + attrs (dict, optional): Dictionary of user attributes to set. Returns: DaosCont: Newly created container as DaosCont object. @@ -1808,6 +1809,9 @@ def create_cont(conf, pool=None, ctype=None, label=None, path=None, oclass=None, if hints: cmd.extend(['--hints', hints]) + if attrs: + cmd.extend(['--attrs', ','.join([f"{name}:{val}" for name, val in attrs.items()])]) + def _create_cont(): """Helper function for create_cont""" rc = run_daos_cmd(conf, cmd, use_json=True, log_check=log_check, valgrind=valgrind, @@ -3064,12 +3068,12 @@ def test_uns_link(self): assert rc.returncode == 0 # Create a second new container which is not linked - container2 = create_cont(self.conf, self.pool, ctype="POSIX", label='mycont_uns_link2') cont_attrs = {'dfuse-attr-time': '5m', 'dfuse-dentry-time': '5m', 'dfuse-dentry-dir-time': '5m', 'dfuse-ndentry-time': '5m'} - container2.set_attrs(cont_attrs) + container2 = create_cont(self.conf, self.pool, ctype="POSIX", label='mycont_uns_link2', + attrs=cont_attrs) # Link and then destroy the first container path = join(self.dfuse.dir, 'uns_link1') @@ -6204,7 +6208,15 @@ def get_cmd(cont_id): '--type', 'POSIX', '--path', - join(dfuse.dir, f'container_{cont_id}')] + join(dfuse.dir, f'container_{cont_id}'), + '--attrs', + ','.join([ + 'dfuse-attr-time:5m', + 'dfuse-dentry-time:4m', + 'dfuse-dentry-dir-time:3m', + 'dfuse-ndentry-time:2m', + 'dfuse-data-cache:off', + 'dfuse-direct-io-disable:off'])] test_cmd = AllocFailTest(conf, 'cont-create', get_cmd) test_cmd.check_post_stdout = False