Skip to content

Commit

Permalink
Merge branch 'release/2.6' into mjean/DAOS-16167-1
Browse files Browse the repository at this point in the history
  • Loading branch information
mjean308 committed Aug 28, 2024
2 parents 2094da0 + 0099aa4 commit 0832e1c
Show file tree
Hide file tree
Showing 54 changed files with 950 additions and 597 deletions.
2 changes: 1 addition & 1 deletion ci/provisioning/post_provision_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ source ci/provisioning/post_provision_config_common_functions.sh
source ci/junit.sh


: "${MLNX_VER_NUM:=latest-5.8}"
: "${MLNX_VER_NUM:=24.04-0.6.6.0}"

: "${DISTRO:=EL_7}"
DSL_REPO_var="DAOS_STACK_${DISTRO}_LOCAL_REPO"
Expand Down
14 changes: 12 additions & 2 deletions docs/admin/administration.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,20 @@ severity, message, description, and cause.

|Event|Event type|Severity|Message|Description|Cause|
|:----|:----|:----|:----|:----|:----|
| device\_set\_faulty| INFO\_ONLY| NOTICE or ERROR| Device: <uuid\> set faulty / Device: <uuid\> set faulty failed: <rc\> / Device: <uuid\> auto faulty detect / Device: <uuid\> auto faulty detect failed: <rc\> | Indicates that a device has been set as faulty, either explicitly or automatically. Device UUID specified in event data. | Either DMG set nvme-faulty command was used to explicitly set device as faulty or an error threshold was reached on a device which has triggered an auto faulty reaction. |
| device\_media\_error| INFO\_ONLY| ERROR| Device: <uuid\> <error-type\> error logged from tgt\_id:<idx\> | Indicates that a device media error has been detected for a specific target. The error type could be unmap, write, read or checksum (csum). Device UUID and target ID specified in event data. | Media error occurred on backing device. |
| device\_unplugged| INFO\_ONLY| NOTICE| Device: <uuid\> unplugged | Indicates device was physically removed from host. | NVMe SSD physically removed from host. |
| device\_plugged| INFO\_ONLY| NOTICE| Detected hot plugged device: <bdev-name\> | Indicates device was physically inserted into host. | NVMe SSD physically added to host. |
| device\_replace| INFO\_ONLY| NOTICE or ERROR| Replaced device: <uuid\> with device: <uuid\> [failed: <rc\>] | Indicates that a faulty device was replaced with a new device and if the operation failed. The old and new device IDs as well as any non-zero return code are specified in the event data. | Device was replaced using DMG nvme replace command. |
| device\_link\_speed\_changed| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link speed changed to <transfer-rate\> (max <transfer-rate\>)| Indicates that an NVMe device link speed has changed. The negotiated and maximum device link speeds are indicated in the event message field and the severity is set to warning if the negotiated speed is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link speed was previously downgraded and has returned to maximum or link speed has downgraded to a value that is less than its maximum capability.|
| device\_link\_width\_changed| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link width changed to <pcie-link-lanes\> (max <pcie-link-lanes\>)| Indicates that an NVMe device link width has changed. The negotiated and maximum device link widths are indicated in the event message field and the severity is set to warning if the negotiated width is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link width was previously downgraded and has returned to maximum or link width has downgraded to a value that is less than its maximum capability.|
| engine\_format\_required|INFO\_ONLY|NOTICE|DAOS engine <idx\> requires a <type\> format|Indicates engine is waiting for allocated storage to be formatted on instance <idx\> with dmg tool. <type\> can be either SCM or Metadata.|DAOS server attempts to bring-up an engine that has unformatted storage.|
| engine\_died| STATE\_CHANGE| ERROR| DAOS engine <idx\> exited unexpectedly: <error\> | Indicates engine instance <idx\> exited unexpectedly. <error\> describes the exit state returned from exited daos\_engine process.| N/A |
| engine\_asserted| STATE\_CHANGE| ERROR| TBD| Indicates engine instance <idx> threw a runtime assertion, causing a crash. | An unexpected internal state resulted in assert failure. |
| engine\_asserted| STATE\_CHANGE| ERROR| TBD| Indicates engine instance <idx\> threw a runtime assertion, causing a crash. | An unexpected internal state resulted in assert failure. |
| engine\_clock\_drift| INFO\_ONLY | ERROR| clock drift detected| Indicates CART comms layer has detected clock skew between engines.| NTP may not be syncing clocks across DAOS system. |
| engine\_join\_failed| INFO\_ONLY| ERROR | DAOS engine <idx\> (rank <rank\>) was not allowed to join the system | Join operation failed for the given engine instance ID and rank (if assigned). | Reason should be provided in the extended info field of the event data. |
| pool\_corruption\_detected| INFO\_ONLY| ERROR | Data corruption detected| Indicates a corruption in pool data has been detected. The event fields will contain pool and container UUIDs. | A corruption was found by the checksum scrubber. |
| pool\_destroy\_deferred| INFO\_ONLY| WARNING | pool:<uuid\> destroy is deferred| Indicates a destroy operation has been deferred. | Pool destroy in progress but not complete. |
| pool\_rebuild\_started| INFO\_ONLY| NOTICE | Pool rebuild started.| Indicates a pool rebuild has started. The event data field contains pool map version and pool operation identifier. | When a pool rank becomes unavailable a rebuild will be triggered. |
| pool\_rebuild\_finished| INFO\_ONLY| NOTICE| Pool rebuild finished.| Indicates a pool rebuild has finished successfully. The event data field includes the pool map version and pool operation identifier. | N/A|
| pool\_rebuild\_failed| INFO\_ONLY| ERROR| Pool rebuild failed: <rc\>.| Indicates a pool rebuild has failed. The event data field includes the pool map version and pool operation identifier. <rc\> provides a string representation of DER code.| N/A |
Expand All @@ -59,7 +69,7 @@ severity, message, description, and cause.
| swim\_rank\_dead| STATE\_CHANGE| NOTICE| SWIM rank marked as dead.| The SWIM protocol has detected the specified rank is unresponsive.| A remote DAOS engine has become unresponsive.|
| system\_start\_failed| INFO\_ONLY| ERROR| System startup failed, <errors\>| Indicates that a user initiated controlled startup failed. <errors\> shows which ranks failed.| Ranks failed to start.|
| system\_stop\_failed| INFO\_ONLY| ERROR| System shutdown failed during <action\> action, <errors\> | Indicates that a user initiated controlled shutdown failed. <action\> identifies the failing shutdown action and <errors\> shows which ranks failed.| Ranks failed to stop.|

| system\_fabric\_provider\_changed| NOTICE| System fabric provider has changed: <old-provider\> -> <new-provider\>| Indicates that the system-wide fabric provider has been updated. No other specific information is included in event data.| A system-wide fabric provider change has been intentionally applied to all joined ranks.|

## System Logging

Expand Down
29 changes: 13 additions & 16 deletions src/bio/bio_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,14 @@ bio_dev_set_faulty(struct bio_xs_context *xs, uuid_t dev_uuid)
rc = dss_abterr2der(rc);

if (rc == 0)
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO,
RAS_SEV_NOTICE, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
"Dev: "DF_UUID" set faulty\n", DP_UUID(dev_uuid));
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO, RAS_SEV_NOTICE, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL,
"Device: " DF_UUID " set faulty\n", DP_UUID(dev_uuid));
else
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO,
RAS_SEV_ERROR, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
"Dev: "DF_UUID" set faulty failed: %d\n", DP_UUID(dev_uuid), rc);
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO, RAS_SEV_ERROR, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL,
"Device: " DF_UUID " set faulty failed: %d\n", DP_UUID(dev_uuid),
rc);
return rc;
}

Expand Down Expand Up @@ -779,16 +778,14 @@ auto_faulty_detect(struct bio_blobstore *bbs)
}

if (rc == 0)
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO,
RAS_SEV_NOTICE, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
"Dev: "DF_UUID" auto faulty detect\n",
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO, RAS_SEV_NOTICE, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL,
"Device: " DF_UUID " auto faulty detect\n",
DP_UUID(bbs->bb_dev->bb_uuid));
else
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO,
RAS_SEV_ERROR, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
"Dev: "DF_UUID" auto faulty detect failed: %d\n",
ras_notify_eventf(RAS_DEVICE_SET_FAULTY, RAS_TYPE_INFO, RAS_SEV_ERROR, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL,
"Device: " DF_UUID " auto faulty detect failed: %d\n",
DP_UUID(bbs->bb_dev->bb_uuid), rc);
}

Expand Down
5 changes: 2 additions & 3 deletions src/bio/bio_xstream.c
Original file line number Diff line number Diff line change
Expand Up @@ -744,9 +744,8 @@ bio_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
D_ASSERT(d_bdev->bb_desc != NULL);
d_bdev->bb_removed = 1;

ras_notify_eventf(RAS_DEVICE_UNPLUGGED, RAS_TYPE_INFO,
RAS_SEV_NOTICE, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, "Dev: "DF_UUID" unplugged\n",
ras_notify_eventf(RAS_DEVICE_UNPLUGGED, RAS_TYPE_INFO, RAS_SEV_NOTICE, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, "Device: " DF_UUID " unplugged\n",
DP_UUID(d_bdev->bb_uuid));

/* The bio_bdev is still under construction */
Expand Down
142 changes: 49 additions & 93 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,97 +14,52 @@
* List of supported CaRT providers. The table is terminated with the last entry
* having nad_str = NULL.
*/
struct crt_na_dict crt_na_dict[] = {
{
.nad_type = CRT_PROV_SM,
.nad_str = "sm",
.nad_contig_eps = false,
.nad_port_bind = false,
}, {
.nad_type = CRT_PROV_OFI_VERBS_RXM,
.nad_str = "ofi+verbs;ofi_rxm",
.nad_alt_str = "ofi+verbs",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_OFI_TCP,
.nad_str = "ofi+tcp",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_OFI_TCP_RXM,
.nad_str = "ofi+tcp;ofi_rxm",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_OFI_CXI,
.nad_str = "ofi+cxi",
.nad_contig_eps = true,
.nad_port_bind = false,
}, {
.nad_type = CRT_PROV_OFI_OPX,
.nad_str = "ofi+opx",
.nad_contig_eps = false,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC,
.nad_str = "ucx+rc_v",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_UD,
.nad_str = "ucx+ud_v",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC_UD,
.nad_str = "ucx+rc_v,ud_v",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC_O,
.nad_str = "ucx+rc",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_UD_O,
.nad_str = "ucx+ud",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC_UD_O,
.nad_str = "ucx+rc,ud",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC_X,
.nad_str = "ucx+rc_x",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_UD_X,
.nad_str = "ucx+ud_x",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_RC_UD_X,
.nad_str = "ucx+rc_x,ud_x",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_DC_X,
.nad_str = "ucx+dc_x",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_type = CRT_PROV_UCX_TCP,
.nad_str = "ucx+tcp",
.nad_contig_eps = true,
.nad_port_bind = true,
}, {
.nad_str = NULL,
}
};
/*
 * Provider lookup table. crt_str_to_provider() scans it in order, comparing
 * the requested provider string against nad_str (and nad_alt_str when set),
 * and stops at the sentinel entry whose nad_str is NULL.
 *
 * The table is intentionally not const: the CRT_PROV_UCX entry has its
 * nad_str replaced at runtime (see the comment on that entry).
 */
struct crt_na_dict crt_na_dict[] = {{
.nad_type = CRT_PROV_SM,
.nad_str = "sm",
.nad_contig_eps = false,
.nad_port_bind = false,
},
{
.nad_type = CRT_PROV_OFI_VERBS_RXM,
.nad_str = "ofi+verbs;ofi_rxm",
/* alternative spelling that is also consulted by crt_str_to_provider() */
.nad_alt_str = "ofi+verbs",
.nad_contig_eps = true,
.nad_port_bind = true,
},
{
.nad_type = CRT_PROV_OFI_TCP,
.nad_str = "ofi+tcp",
.nad_contig_eps = true,
.nad_port_bind = true,
},
{
.nad_type = CRT_PROV_OFI_TCP_RXM,
.nad_str = "ofi+tcp;ofi_rxm",
.nad_contig_eps = true,
.nad_port_bind = true,
},
{
.nad_type = CRT_PROV_OFI_CXI,
.nad_str = "ofi+cxi",
.nad_contig_eps = true,
.nad_port_bind = false,
},
{
.nad_type = CRT_PROV_OFI_OPX,
.nad_str = "ofi+opx",
.nad_contig_eps = false,
.nad_port_bind = true,
},
/*
 * Single generic UCX entry. "ucx+ud_x" is only the initial string:
 * when crt_str_to_provider() is given a string that starts with "ucx",
 * is longer than "ucx" and contains '+', it overwrites nad_str with a
 * heap copy of that string and sets nad_str_alloc, so that
 * crt_finalize() can free it.
 */
{
.nad_type = CRT_PROV_UCX,
.nad_str = "ucx+ud_x",
.nad_contig_eps = true,
.nad_port_bind = true,
},
/* sentinel: nad_str == NULL terminates the table */
{
.nad_str = NULL,
}};

int
crt_hg_parse_uri(const char *uri, crt_provider_t *prov, char *addr)
Expand Down Expand Up @@ -717,6 +672,8 @@ crt_get_info_string(bool primary, crt_provider_t provider, int iface_idx,
start_port = crt_provider_ctx0_port_get(primary, provider);
domain_str = crt_provider_domain_str_get(primary, provider, iface_idx);

D_ASSERTF(provider_str != NULL, "String for provider=%d not found\n", provider);

/* CXI provider uses domain names for info string */
if (provider == CRT_PROV_OFI_CXI)
iface_str = NULL;
Expand All @@ -735,8 +692,7 @@ crt_get_info_string(bool primary, crt_provider_t provider, int iface_idx,
D_GOTO(out, rc);
}

if (provider_str)
size += strlen(provider_str);
size = strlen(provider_str);
if (domain_str)
size += strlen(domain_str);
if (iface_str)
Expand Down
25 changes: 9 additions & 16 deletions src/cart/crt_hg.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#define CRT_HG_POST_INCR (512)
#define CRT_HG_MRECV_BUF (16)

#define CRT_UCX_STR "ucx"

struct crt_rpc_priv;
struct crt_common_hdr;
struct crt_corpc_hdr;
Expand All @@ -40,27 +42,17 @@ struct crt_corpc_hdr;
* Enumeration specifying providers supported by the library
*/
typedef enum {
CRT_PROV_SM = 0,
CRT_PROV_SM = 0,
CRT_PROV_OFI_SOCKETS,
CRT_PROV_OFI_VERBS_RXM,
CRT_PROV_OFI_GNI,
CRT_PROV_OFI_TCP,
CRT_PROV_OFI_TCP_RXM,
CRT_PROV_OFI_CXI,
CRT_PROV_OFI_OPX,
CRT_PROV_OFI_LAST = CRT_PROV_OFI_OPX,
CRT_PROV_UCX_RC,
CRT_PROV_UCX_UD,
CRT_PROV_UCX_RC_UD,
CRT_PROV_UCX_RC_O,
CRT_PROV_UCX_UD_O,
CRT_PROV_UCX_RC_UD_O,
CRT_PROV_UCX_RC_X,
CRT_PROV_UCX_UD_X,
CRT_PROV_UCX_RC_UD_X,
CRT_PROV_UCX_DC_X,
CRT_PROV_UCX_TCP,
CRT_PROV_UCX_LAST = CRT_PROV_UCX_TCP,
CRT_PROV_OFI_LAST = CRT_PROV_OFI_OPX,
CRT_PROV_UCX,
CRT_PROV_UCX_LAST = CRT_PROV_UCX,
/* Note: This entry should be the last valid one in enum */
CRT_PROV_COUNT,
CRT_PROV_UNKNOWN = -1,
Expand All @@ -75,8 +67,7 @@ crt_hg_parse_uri(const char *uri, crt_provider_t *prov, char *addr);
static inline bool
crt_provider_is_ucx(crt_provider_t prov)
{
return (prov >= CRT_PROV_UCX_RC) &&
(prov <= CRT_PROV_UCX_LAST);
return (prov >= CRT_PROV_UCX) && (prov <= CRT_PROV_UCX_LAST);
}

static inline bool
Expand All @@ -96,6 +87,8 @@ struct crt_na_dict {
bool nad_port_bind;
/** a flag to indicate if endpoints are contiguous */
bool nad_contig_eps;
/** a flag to indicate if nad_str is allocated on the heap */
bool nad_str_alloc;
};

extern struct crt_na_dict crt_na_dict[];
Expand Down
23 changes: 21 additions & 2 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -444,13 +444,13 @@ crt_provider_t
crt_str_to_provider(const char *str_provider)
{
crt_provider_t prov = CRT_PROV_UNKNOWN;
int i;
int i, len;
char *p = NULL;

if (str_provider == NULL)
return prov;

for (i = 0; crt_na_dict[i].nad_str != NULL; i++) {

if (!strncmp(str_provider, crt_na_dict[i].nad_str,
strlen(crt_na_dict[i].nad_str) + 1) ||
(crt_na_dict[i].nad_alt_str &&
Expand All @@ -459,6 +459,21 @@ crt_str_to_provider(const char *str_provider)
prov = crt_na_dict[i].nad_type;
break;
}
if (crt_na_dict[i].nad_type == CRT_PROV_UCX &&
!strncmp(str_provider, CRT_UCX_STR, strlen(CRT_UCX_STR))) {
len = strlen(str_provider);
if (len > strlen(CRT_UCX_STR) && strchr(str_provider, '+')) {
D_STRNDUP(p, str_provider, len);
if (!p) {
return prov;
} else {
crt_na_dict[i].nad_str = p;
crt_na_dict[i].nad_str_alloc = true;
}
}
prov = crt_na_dict[i].nad_type;
break;
}
}

return prov;
Expand Down Expand Up @@ -964,6 +979,10 @@ crt_finalize(void)
crt_na_config_fini(false, crt_gdata.cg_secondary_provs[i]);
}

for (i = 0; crt_na_dict[i].nad_str != NULL; i++)
if (crt_na_dict[i].nad_str_alloc)
D_FREE(crt_na_dict[i].nad_str);

D_FREE(crt_gdata.cg_secondary_provs);
D_FREE(crt_gdata.cg_prov_gdata_secondary);
} else {
Expand Down
7 changes: 0 additions & 7 deletions src/client/dfuse/pil4dfs/int_dfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -940,13 +940,6 @@ child_hdlr(void)
DL_WARN(rc, "daos_eq_lib_init() failed in child process");
daos_dti_reset();
td_eqh = main_eqh = DAOS_HDL_INVAL;
if (d_eq_count_max > 0) {
rc = daos_eq_create(&td_eqh);
if (rc)
DL_WARN(rc, "daos_eq_create() failed");
else
main_eqh = td_eqh;
}
context_reset = true;
}

Expand Down
4 changes: 3 additions & 1 deletion src/control/events/ras.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2022 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -55,6 +55,8 @@ const (
RASSystemStopFailed RASID = C.RAS_SYSTEM_STOP_FAILED // error
RASEngineJoinFailed RASID = C.RAS_ENGINE_JOIN_FAILED // error
RASSystemFabricProvChanged RASID = C.RAS_SYSTEM_FABRIC_PROV_CHANGED // info
RASNVMeLinkSpeedChanged RASID = C.RAS_DEVICE_LINK_SPEED_CHANGED // warning|notice
RASNVMeLinkWidthChanged RASID = C.RAS_DEVICE_LINK_WIDTH_CHANGED // warning|notice
)

func (id RASID) String() string {
Expand Down
Loading

0 comments on commit 0832e1c

Please sign in to comment.