diff --git a/.clang-format b/.clang-format
index 631567895400..e5d9bddc4995 100644
--- a/.clang-format
+++ b/.clang-format
@@ -14,6 +14,7 @@ IndentCaseLabels: false
 ForEachMacros: ['d_list_for_each_entry',
                 'd_list_for_each_safe',
                 'd_list_for_each_entry_safe',
+                'd_list_for_each_entry_reverse',
                 'evt_ent_array_for_each']
 PointerAlignment: Right
 AlignTrailingComments: true
diff --git a/debian/changelog b/debian/changelog
index c9f8c2e6aa60..e966d2be46c5 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,15 @@
+daos (2.5.100-15) unstable; urgency=medium
+  [ Ashley M. Pittman ]
+  * Updated pydaos install process
+
+ -- Ashley M. Pittman  Fri, 02 Feb 2024 09:15:00 -0800
+
+daos (2.5.100-14) unstable; urgency=medium
+  [ Brian J. Murrell ]
+  * NOOP change to keep in parity with RPM version
+
+ -- Brian J. Murrell  Tue, 09 Jan 2024 13:59:01 -0500
+
 daos (2.5.100-13) unstable; urgency=medium
   [ Brian J. Murrell ]
   * Update for EL 8.8 and Leap 15.5
diff --git a/docs/admin/administration.md b/docs/admin/administration.md
index a65ef4a85ce4..7e8ed1abef9d 100644
--- a/docs/admin/administration.md
+++ b/docs/admin/administration.md
@@ -478,6 +478,59 @@ boro-11
 ```
 
 #### Exclusion and Hotplug
+- Automatic exclusion of an NVMe SSD:
+
+Automatic exclusion based on faulty criteria is the default behavior in DAOS
+release 2.6. The default criteria parameters are `max_io_errs: 10` and
+`max_csum_errs: <UINT32_MAX>` (essentially, eviction due to checksum errors is
+disabled by default).
+
+Auto-faulty criteria parameters can be set by adding the following YAML to the
+engine section of the server config file:
+
+```yaml
+engines:
+- bdev_auto_faulty:
+    enable: true
+    max_io_errs: 1
+    max_csum_errs: 2
+```
+
+On formatting the storage for the engine, these settings produce the following
+`daos_server` log entries, indicating that the parameters have been written to
+the engine's NVMe config:
+
+```bash
+DEBUG 13:59:29.229795 provider.go:592: BdevWriteConfigRequest: &{ForwardableRequest:{Forwarded:false} ConfigOutputPath:/mnt/daos0/daos_nvme.conf OwnerUID:10695475 OwnerGID:10695475 TierProps:[{Class:nvme DeviceList:0000:5e:00.0 DeviceFileSize:0 Tier:1 DeviceRoles:{OptionBits:0}}] HotplugEnabled:false HotplugBusidBegin:0 HotplugBusidEnd:0 Hostname:wolf-310.wolf.hpdd.intel.com AccelProps:{Engine: Options:0} SpdkRpcSrvProps:{Enable:false SockAddr:} AutoFaultyProps:{Enable:true MaxIoErrs:1 MaxCsumErrs:2} VMDEnabled:false ScannedBdevs:}
+Writing NVMe config file for engine instance 0 to "/mnt/daos0/daos_nvme.conf"
+```
+
+The engine's NVMe config (produced during format) then contains the following
+JSON to apply the criteria:
+
+```json
+[tanabarr@wolf-310 ~]$ cat /mnt/daos0/daos_nvme.conf
+{
+  "daos_data": {
+    "config": [
+      {
+        "params": {
+          "enable": true,
+          "max_io_errs": 1,
+          "max_csum_errs": 2
+        },
+        "method": "auto_faulty"
+ ...
+```
+
+These engine logfile entries indicate that the settings have been read and
+applied:
+
+```bash
+01/12-13:59:41.36 wolf-310 DAOS[1299350/-1/0] bio  INFO src/bio/bio_config.c:1016 bio_read_auto_faulty_criteria() NVMe auto faulty is enabled. Criteria: max_io_errs:1, max_csum_errs:2
+```
+
 - Manually exclude an NVMe SSD:
 ```bash
 $ dmg storage set nvme-faulty --help
@@ -491,7 +544,7 @@ Usage:
   -f, --force   Do not require confirmation
 ```
 
-To manually evict an NVMe SSD (auto eviction will be supported in a future release),
+To manually evict an NVMe SSD (auto eviction is covered earlier in this section),
 the device state needs to be set faulty by running the following command:
 ```bash
 $ dmg -l boro-11 storage set nvme-faulty --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md
index bda3dd2aaf6e..796ae85995ed 100644
--- a/docs/admin/deployment.md
+++ b/docs/admin/deployment.md
@@ -123,7 +123,7 @@ Application Options:
       --allow-proxy     Allow proxy configuration via environment
   -o, --config=         Server config file path
   -b, --debug           Enable debug output
-  -j, --json            enable JSON output
+  -j, --json            Enable JSON output
   -J, --json-logging    Enable JSON-formatted log output
       --syslog          Enable logging to syslog
diff --git a/src/bio/README.md b/src/bio/README.md
index 44364ba19a72..2439352eace0 100644
--- a/src/bio/README.md
+++ b/src/bio/README.md
@@ -81,7 +81,7 @@ While monitoring this health data, an admin can now make the determination to ma
 
 ## Faulty Device Detection (SSD Eviction)
 
-Faulty device detection and reaction can be referred to as NVMe SSD eviction. This involves all affected pool targets being marked as down and the rebuild of all affected pool targets being automatically triggered. A persistent device state is maintained in SMD and the device state is updated from NORMAL to FAULTY upon SSD eviction. The faulty device reaction will involve various SPDK cleanup, including all I/O channels released, SPDK allocations (termed 'blobs') closed, and the SPDK blobstore created on the NVMe SSD unloaded. Currently only manual SSD eviction is supported, and a future release will support automatic SSD eviction.
+Faulty device detection and reaction can be referred to as NVMe SSD eviction. This involves all affected pool targets being marked as down and the rebuild of all affected pool targets being automatically triggered. A persistent device state is maintained in SMD and the device state is updated from NORMAL to FAULTY upon SSD eviction. The faulty device reaction involves various SPDK cleanup, including releasing all I/O channels, closing SPDK allocations (termed 'blobs'), and unloading the SPDK blobstore created on the NVMe SSD. Automatic SSD eviction is enabled by default and can be disabled using the `bdev_auto_faulty` server config file engine parameter.
 
 Useful admin commands to manually evict an NVMe SSD:
 - dmg storage set nvme-faulty [used to manually set an NVMe SSD to FAULTY (ie evict the device)]
@@ -89,7 +89,9 @@ Faulty device detection and reaction can be referred to as NVMe SSD eviction. Th
 
 ## NVMe SSD Hot Plug
 
-**Full NVMe hot plug capability will be available and supported in DAOS 2.0 release. Use is currently intended for testing only and is not supported for production.**
+NVMe hot plug with Intel VMD devices is supported in this release.
+
+**Full hot plug capability when using non-Intel-VMD devices will be supported in the DAOS 2.8 release. Use is currently intended for testing only and is not supported for production.**
 
 The NVMe hot plug feature includes device removal (an NVMe hot remove event) and device reintegration (an NVMe hotplug event) when a faulty device is replaced with a new device.
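To make the workflow concrete, below is a minimal sketch of the manual eviction and replacement flow using the dmg commands listed in this document. The replacement UUID is hypothetical, and the `query list-devices` subcommand and `--old-uuid`/`--new-uuid` flags are assumptions about the dmg syntax; check `dmg storage --help` on your release for the exact options.

```bash
# List per-host devices to find the UUID and state of the SSD to evict
# (subcommand name is an assumption; see dmg storage query --help)
$ dmg -l boro-11 storage query list-devices

# Mark the SSD FAULTY (i.e. evict it); affected pool targets are marked
# down and rebuild is triggered automatically
$ dmg -l boro-11 storage set nvme-faulty --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19

# After hot plugging a replacement SSD, replace the evicted device with
# the new one; SPDK in-memory stubs are recreated and affected pool
# targets reintegrated (new UUID below is hypothetical)
$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 \
      --new-uuid=2ccb8afb-5d32-454e-86e3-762ec5928c49
```

Passing the same device ID for both the old and new device brings an evicted device back online without reintegration, which, as noted in this section, is intended for testing only.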
@@ -97,8 +99,6 @@ For device removal, if the device is a faulty or previously evicted device, then For device reintegration, if a new device is plugged to replace a faulty device, the admin would need to issue a device replacement command. All SPDK in-memory stubs would be created and all affected pool targets automatically reintegrated on the new device. The device state would be displayed as NEW initially and NORMAL after the replacement event occurred. If a faulty device or previously evicted device is re-plugged, the device will remain evicted, and the device state would display EVICTED. If a faulty device is desired to be reused (NOTE: this is not advised, mainly used for testing purposes), the admin can run the same device replacement command setting the new and old device IDs to be the same device ID. Reintegration will not occur on the device, as DAOS does not currently support incremental reintegration. -NVMe hot plug with Intel VMD devices is currently not supported in this release, but will be supported in a future release. - Useful admin commands to replace an evicted device: - dmg storage replace nvme [used to replace an evicted device with a new device] - dmg storage replace nvme [used to bring an evicted device back online (without reintegration)] diff --git a/src/bio/bio_config.c b/src/bio/bio_config.c index e9c546cba4c0..f1c8d3550f9c 100644 --- a/src/bio/bio_config.c +++ b/src/bio/bio_config.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2021-2023 Intel Corporation. + * (C) Copyright 2021-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -117,9 +117,6 @@ struct busid_range_info { uint8_t end; }; -/* PCI address bus-ID range to be used to filter hotplug events */ -struct busid_range_info hotplug_busid_range = {}; - static struct spdk_json_object_decoder busid_range_decoders[] = { {"begin", offsetof(struct busid_range_info, begin), spdk_json_decode_uint8}, @@ -131,9 +128,6 @@ struct accel_props_info { uint16_t opt_mask; }; -/* Acceleration properties to specify engine to use and optional capabilities to enable */ -struct accel_props_info accel_props = {}; - static struct spdk_json_object_decoder accel_props_decoders[] = { {"accel_engine", offsetof(struct accel_props_info, engine), spdk_json_decode_string}, @@ -145,15 +139,24 @@ struct rpc_srv_info { char *sock_addr; }; -/* Settings to enable an SPDK JSON-RPC server to run in current process */ -struct rpc_srv_info rpc_srv_settings = {}; - static struct spdk_json_object_decoder rpc_srv_decoders[] = { {"enable", offsetof(struct rpc_srv_info, enable), spdk_json_decode_bool}, {"sock_addr", offsetof(struct rpc_srv_info, sock_addr), spdk_json_decode_string}, }; +struct auto_faulty_info { + bool enable; + uint32_t max_io_errs; + uint32_t max_csum_errs; +}; + +static struct spdk_json_object_decoder auto_faulty_decoders[] = { + {"enable", offsetof(struct auto_faulty_info, enable), spdk_json_decode_bool}, + {"max_io_errs", offsetof(struct auto_faulty_info, max_io_errs), spdk_json_decode_uint32}, + {"max_csum_errs", offsetof(struct auto_faulty_info, max_csum_errs), spdk_json_decode_uint32}, +}; + static int is_addr_in_allowlist(char *pci_addr, const struct spdk_pci_addr *allowlist, int num_allowlist_devices) @@ -419,7 +422,6 @@ add_traddrs_from_bdev_subsys(struct json_config_ctx *ctx, bool vmd_enabled, } if (strcmp(cfg.method, NVME_CONF_ATTACH_CONTROLLER) != 0) { - D_DEBUG(DB_MGMT, "skip config entry %s\n", cfg.method); goto free_method; } @@ -479,6 +481,7 @@ add_traddrs_from_bdev_subsys(struct json_config_ctx 
*ctx, bool vmd_enabled, free_method: D_FREE(cfg.method); + /* Decode functions return positive RC for success or not-found */ if (rc > 0) rc = 0; return rc; @@ -506,7 +509,6 @@ check_name_from_bdev_subsys(struct json_config_ctx *ctx) if (strcmp(cfg.method, NVME_CONF_ATTACH_CONTROLLER) != 0 && strcmp(cfg.method, NVME_CONF_AIO_CREATE) != 0) { - D_DEBUG(DB_MGMT, "skip config entry %s\n", cfg.method); goto free_method; } @@ -750,7 +752,7 @@ decode_daos_data(const char *nvme_conf, const char *method_name, struct config_e if (rc != 0) D_GOTO(out, rc); - /* Capture daos object */ + /* Capture daos_data JSON object */ rc = spdk_json_find(ctx->values, "daos_data", NULL, &daos_data, SPDK_JSON_VAL_OBJECT_BEGIN); if (rc < 0) { @@ -769,8 +771,8 @@ decode_daos_data(const char *nvme_conf, const char *method_name, struct config_e /* Get 'config' array first configuration entry */ ctx->config_it = spdk_json_array_first(ctx->config); if (ctx->config_it == NULL) { - D_DEBUG(DB_MGMT, "Empty 'daos_data' section\n"); - D_GOTO(out, rc = 1); /* non-fatal */ + /* Entry not-found so return positive RC */ + D_GOTO(out, rc = 1); } while (ctx->config_it != NULL) { @@ -789,14 +791,16 @@ decode_daos_data(const char *nvme_conf, const char *method_name, struct config_e } if (ctx->config_it == NULL) { - D_DEBUG(DB_MGMT, "No '%s' entry\n", method_name); - rc = 1; /* non-fatal */ + /* Entry not-found so return positive RC */ + rc = 1; } out: free_json_config_ctx(ctx); return rc; } +struct busid_range_info hotplug_busid_range = {}; + static int get_hotplug_busid_range(const char *nvme_conf) { @@ -816,11 +820,12 @@ get_hotplug_busid_range(const char *nvme_conf) D_GOTO(out, rc = -DER_INVAL); } - D_DEBUG(DB_MGMT, "'%s' read from config: %X-%X\n", NVME_CONF_SET_HOTPLUG_RANGE, - hotplug_busid_range.begin, hotplug_busid_range.end); + D_INFO("'%s' read from config: %X-%X\n", NVME_CONF_SET_HOTPLUG_RANGE, + hotplug_busid_range.begin, hotplug_busid_range.end); out: if (cfg.method != NULL) D_FREE(cfg.method); + /* Decode functions return positive RC for success or not-found */ if (rc > 0) rc = 0; return 0; @@ -846,6 +851,7 @@ hotplug_filter_fn(const struct spdk_pci_addr *addr) /** * Set hotplug bus-ID ranges in SPDK filter based on values read from JSON config file. + * The PCI bus-ID ranges will be used to filter hotplug events. * * \param[in] nvme_conf JSON config file path * @@ -856,6 +862,8 @@ bio_set_hotplug_filter(const char *nvme_conf) { int rc; + D_ASSERT(nvme_conf != NULL); + rc = get_hotplug_busid_range(nvme_conf); if (rc != 0) return rc; @@ -866,7 +874,8 @@ bio_set_hotplug_filter(const char *nvme_conf) } /** - * Read optional acceleration properties from JSON config file. + * Read acceleration properties from JSON config file to specify which acceleration engine to use + * and selections of optional capabilities to enable. * * \param[in] nvme_conf JSON config file path * @@ -876,8 +885,11 @@ int bio_read_accel_props(const char *nvme_conf) { struct config_entry cfg = {}; + struct accel_props_info accel_props = {}; int rc; + D_ASSERT(nvme_conf != NULL); + rc = decode_daos_data(nvme_conf, NVME_CONF_SET_ACCEL_PROPS, &cfg); if (rc != 0) goto out; @@ -891,7 +903,7 @@ bio_read_accel_props(const char *nvme_conf) D_GOTO(out, rc = -DER_INVAL); } - D_DEBUG(DB_MGMT, "'%s' read from config, setting: %s, capabilities: move=%s,crc=%s\n", + D_INFO("'%s' read from config, setting: %s, capabilities: move=%s,crc=%s\n", NVME_CONF_SET_ACCEL_PROPS, accel_props.engine, CHK_FLAG(accel_props.opt_mask, NVME_ACCEL_FLAG_MOVE) ? 
"true" : "false", CHK_FLAG(accel_props.opt_mask, NVME_ACCEL_FLAG_CRC) ? "true" : "false"); @@ -900,13 +912,16 @@ bio_read_accel_props(const char *nvme_conf) out: if (cfg.method != NULL) D_FREE(cfg.method); + /* Decode functions return positive RC for success or not-found */ if (rc > 0) rc = 0; return rc; } /** - * Set output parameters based on JSON config settings for option SPDK JSON-RPC server. + * Retrieve JSON config settings for option SPDK JSON-RPC server. Read flag to indicate whether to + * enable the SPDK JSON-RPC server and the socket file address from the JSON config used to + * initialize SPDK subsystems. * * \param[in] nvme_conf JSON config file path * \param[out] enable Flag to enable the RPC server @@ -918,14 +933,19 @@ int bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock_addr) { struct config_entry cfg = {}; + struct rpc_srv_info rpc_srv_settings = {}; int rc; + D_ASSERT(nvme_conf != NULL); + D_ASSERT(enable != NULL); + D_ASSERT(sock_addr != NULL); + D_ASSERT(*sock_addr == NULL); + rc = decode_daos_data(nvme_conf, NVME_CONF_SET_SPDK_RPC_SERVER, &cfg); if (rc != 0) goto out; - rc = spdk_json_decode_object(cfg.params, rpc_srv_decoders, - SPDK_COUNTOF(rpc_srv_decoders), + rc = spdk_json_decode_object(cfg.params, rpc_srv_decoders, SPDK_COUNTOF(rpc_srv_decoders), &rpc_srv_settings); if (rc < 0) { D_ERROR("Failed to decode '%s' entry: %s)\n", NVME_CONF_SET_SPDK_RPC_SERVER, @@ -936,11 +956,68 @@ bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock *enable = rpc_srv_settings.enable; *sock_addr = rpc_srv_settings.sock_addr; - D_DEBUG(DB_MGMT, "'%s' read from config: enabled=%d, addr %s\n", - NVME_CONF_SET_SPDK_RPC_SERVER, *enable, (char *)*sock_addr); + D_INFO("'%s' read from config: enabled=%d, addr %s\n", NVME_CONF_SET_SPDK_RPC_SERVER, + *enable, (char *)*sock_addr); out: if (cfg.method != NULL) D_FREE(cfg.method); + /* Decode functions return positive RC for success or not-found */ + if (rc > 0) + rc = 0; + return rc; +} + +/** + * Set output parameters based on JSON config settings for NVMe auto-faulty feature and threshold + * criteria. + * + * \param[in] nvme_conf JSON config file path + * \param[out] enable Flag to enable the auto-faulty feature + * \param[out] max_io_errs Max IO errors (threshold) before marking as faulty + * \param[out] max_csum_errs Max checksum errors (threshold) before marking as faulty + * + * \returns Zero on success, negative on failure (DER) + */ +int +bio_read_auto_faulty_criteria(const char *nvme_conf, bool *enable, uint32_t *max_io_errs, + uint32_t *max_csum_errs) +{ + struct config_entry cfg = {}; + struct auto_faulty_info auto_faulty_criteria = {}; + int rc; + + rc = decode_daos_data(nvme_conf, NVME_CONF_SET_AUTO_FAULTY, &cfg); + if (rc != 0) + goto out; + + rc = spdk_json_decode_object(cfg.params, auto_faulty_decoders, + SPDK_COUNTOF(auto_faulty_decoders), &auto_faulty_criteria); + if (rc < 0) { + D_ERROR("Failed to decode '%s' entry: %s)\n", NVME_CONF_SET_AUTO_FAULTY, + spdk_strerror(-rc)); + D_GOTO(out, rc = -DER_INVAL); + } + + *enable = auto_faulty_criteria.enable; + if (*enable == false) { + *max_io_errs = UINT32_MAX; + *max_csum_errs = UINT32_MAX; + goto out; + } + *max_io_errs = auto_faulty_criteria.max_io_errs; + if (*max_io_errs == 0) + *max_io_errs = UINT32_MAX; + *max_csum_errs = auto_faulty_criteria.max_csum_errs; + if (*max_csum_errs == 0) + *max_csum_errs = UINT32_MAX; + +out: + D_INFO("NVMe auto faulty is %s. Criteria: max_io_errs:%u, max_csum_errs:%u\n", + *enable ? 
"enabled" : "disabled", *max_io_errs, *max_csum_errs); + + if (cfg.method != NULL) + D_FREE(cfg.method); + /* Decode functions return positive RC for success or not-found */ if (rc > 0) rc = 0; return rc; diff --git a/src/bio/bio_context.c b/src/bio/bio_context.c index 39c2d70d178d..365e35953e65 100644 --- a/src/bio/bio_context.c +++ b/src/bio/bio_context.c @@ -587,6 +587,7 @@ __bio_ioctxt_open(struct bio_io_context **pctxt, struct bio_xs_context *xs_ctxt, D_INIT_LIST_HEAD(&ctxt->bic_link); ctxt->bic_xs_ctxt = xs_ctxt; uuid_copy(ctxt->bic_pool_id, uuid); + ctxt->bic_blob_id = SPDK_BLOBID_INVALID; bxb = bio_xs_context2xs_blobstore(xs_ctxt, st); D_ASSERT(bxb != NULL); @@ -1005,6 +1006,7 @@ bio_blob_close(struct bio_io_context *ctxt, bool async) ba->bca_inflights = 1; bma->bma_ioc = ctxt; bma->bma_async = async; + ctxt->bic_blob_id = spdk_blob_get_id(ctxt->bic_blob); spdk_thread_send_msg(owner_thread(bbs), blob_msg_close, bma); if (async) diff --git a/src/bio/bio_device.c b/src/bio/bio_device.c index 7edaf09cbf9d..a73ad77a5668 100644 --- a/src/bio/bio_device.c +++ b/src/bio/bio_device.c @@ -460,7 +460,10 @@ alloc_ctrlr_info(uuid_t dev_id, char *dev_name, struct bio_dev_info *b_info) D_ASSERT(b_info->bdi_ctrlr == NULL); if (dev_name == NULL) { - D_DEBUG(DB_MGMT, "missing bdev device name, skipping ctrlr info fetch\n"); + D_DEBUG(DB_MGMT, + "missing bdev device name for device " DF_UUID ", skipping ctrlr " + "info fetch\n", + DP_UUID(dev_id)); return 0; } @@ -606,11 +609,46 @@ struct led_opts { int status; }; +static Ctl__LedState +led_state_spdk2daos(enum spdk_vmd_led_state in) +{ + switch (in) { + case SPDK_VMD_LED_STATE_OFF: + return CTL__LED_STATE__OFF; + case SPDK_VMD_LED_STATE_IDENTIFY: + return CTL__LED_STATE__QUICK_BLINK; + case SPDK_VMD_LED_STATE_FAULT: + return CTL__LED_STATE__ON; + case SPDK_VMD_LED_STATE_REBUILD: + return CTL__LED_STATE__SLOW_BLINK; + default: + return CTL__LED_STATE__NA; + } +} + +static enum spdk_vmd_led_state +led_state_daos2spdk(Ctl__LedState in) +{ + switch (in) { + case CTL__LED_STATE__OFF: + return SPDK_VMD_LED_STATE_OFF; + case CTL__LED_STATE__QUICK_BLINK: + return SPDK_VMD_LED_STATE_IDENTIFY; + case CTL__LED_STATE__ON: + return SPDK_VMD_LED_STATE_FAULT; + case CTL__LED_STATE__SLOW_BLINK: + return SPDK_VMD_LED_STATE_REBUILD; + default: + return SPDK_VMD_LED_STATE_UNKNOWN; + } +} + static void led_device_action(void *ctx, struct spdk_pci_device *pci_device) { struct led_opts *opts = ctx; enum spdk_vmd_led_state cur_led_state; + Ctl__LedState d_led_state; const char *pci_dev_type = NULL; char addr_buf[ADDR_STR_MAX_LEN + 1]; int rc; @@ -656,14 +694,17 @@ led_device_action(void *ctx, struct spdk_pci_device *pci_device) return; } + /* Convert state to Ctl__LedState from SPDK led_state */ + d_led_state = led_state_spdk2daos(cur_led_state); + D_DEBUG(DB_MGMT, "led on dev %s has state: %s (action: %s, new state: %s)\n", addr_buf, - LED_STATE_NAME(cur_led_state), LED_ACTION_NAME(opts->action), + LED_STATE_NAME(d_led_state), LED_ACTION_NAME(opts->action), LED_STATE_NAME(opts->led_state)); switch (opts->action) { case CTL__LED_ACTION__GET: /* Return early with current device state set */ - opts->led_state = (Ctl__LedState)cur_led_state; + opts->led_state = d_led_state; return; case CTL__LED_ACTION__SET: break; @@ -678,14 +719,14 @@ led_device_action(void *ctx, struct spdk_pci_device *pci_device) return; } - if (cur_led_state == (enum spdk_vmd_led_state)opts->led_state) { + if (d_led_state == opts->led_state) { D_DEBUG(DB_MGMT, "VMD device %s LED state already in state 
%s\n", addr_buf, LED_STATE_NAME(opts->led_state)); return; } /* Set the LED to the new state */ - rc = spdk_vmd_set_led_state(pci_device, (enum spdk_vmd_led_state)opts->led_state); + rc = spdk_vmd_set_led_state(pci_device, led_state_daos2spdk(opts->led_state)); if (spdk_unlikely(rc != 0)) { D_ERROR("Failed to set the VMD LED state on %s (%s)\n", addr_buf, spdk_strerror(-rc)); @@ -700,11 +741,12 @@ led_device_action(void *ctx, struct spdk_pci_device *pci_device) opts->status = -DER_NOSYS; return; } + d_led_state = led_state_spdk2daos(cur_led_state); /* Verify the correct state is set */ - if (cur_led_state != (enum spdk_vmd_led_state)opts->led_state) { + if (d_led_state != opts->led_state) { D_ERROR("Unexpected LED state on %s, want %s got %s\n", addr_buf, - LED_STATE_NAME(opts->led_state), LED_STATE_NAME(cur_led_state)); + LED_STATE_NAME(opts->led_state), LED_STATE_NAME(d_led_state)); opts->status = -DER_INVAL; } } diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index 5bd02cd386c6..bae492ff9084 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -376,6 +376,7 @@ struct bio_xs_blobstore { struct bio_blobstore *bxb_blobstore; /* All I/O contexts for this xstream blobstore */ d_list_t bxb_io_ctxts; + bool bxb_ready; }; /* Per-xstream NVMe context */ @@ -391,6 +392,7 @@ struct bio_xs_context { struct bio_io_context { d_list_t bic_link; /* link to bxb_io_ctxts */ struct spdk_blob *bic_blob; + spdk_blob_id bic_blob_id; struct bio_xs_blobstore *bic_xs_blobstore; struct bio_xs_context *bic_xs_ctxt; uint32_t bic_inflight_dmas; @@ -656,11 +658,17 @@ int fill_in_traddr(struct bio_dev_info *b_info, char *dev_name); /* bio_config.c */ int - bio_add_allowed_alloc(const char *nvme_conf, struct spdk_env_opts *opts, int *roles, - bool *vmd_enabled); -int bio_set_hotplug_filter(const char *nvme_conf); -int bio_read_accel_props(const char *nvme_conf); -int bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock_addr); +bio_add_allowed_alloc(const char *nvme_conf, struct spdk_env_opts *opts, int *roles, + bool *vmd_enabled); +int +bio_set_hotplug_filter(const char *nvme_conf); +int +bio_read_accel_props(const char *nvme_conf); +int +bio_read_rpc_srv_settings(const char *nvme_conf, bool *enable, const char **sock_addr); +int +bio_read_auto_faulty_criteria(const char *nvme_conf, bool *enable, uint32_t *max_io_errs, + uint32_t *max_csum_errs); int bio_decode_bdev_params(struct bio_dev_info *b_info, const void *json, int json_size); #endif /* __BIO_INTERNAL_H__ */ diff --git a/src/bio/bio_recovery.c b/src/bio/bio_recovery.c index bd6632903d10..bc779046b6ce 100644 --- a/src/bio/bio_recovery.c +++ b/src/bio/bio_recovery.c @@ -60,6 +60,8 @@ on_faulty(struct bio_blobstore *bbs) static void teardown_xs_bs(void *arg) { + struct bio_io_context *ioc; + int opened_blobs = 0; struct bio_xs_blobstore *bxb = arg; D_ASSERT(bxb != NULL); @@ -72,8 +74,23 @@ teardown_xs_bs(void *arg) if (bxb->bxb_io_channel == NULL) return; - /* Blobs (VOS pools) should have been close on faulty reaction */ - D_ASSERT(d_list_empty(&bxb->bxb_io_ctxts)); + /* When a normal device is unplugged, the opened blobs need be closed here */ + d_list_for_each_entry(ioc, &bxb->bxb_io_ctxts, bic_link) { + if (ioc->bic_blob == NULL && ioc->bic_opening == 0) + continue; + + opened_blobs++; + if (ioc->bic_closing || ioc->bic_opening) + continue; + + bio_blob_close(ioc, true); + } + + if (opened_blobs) { + D_DEBUG(DB_MGMT, "blobstore:%p has %d opened blobs\n", + bxb->bxb_blobstore, opened_blobs); + return; + 
} /* Put the io channel */ if (bxb->bxb_io_channel != NULL) { @@ -158,6 +175,7 @@ on_teardown(struct bio_blobstore *bbs) continue; D_ASSERT(xs_ctxt->bxc_thread != NULL); + bxb->bxb_ready = false; spdk_thread_send_msg(xs_ctxt->bxc_thread, teardown_xs_bs, bxb); rc += 1; } @@ -199,8 +217,10 @@ on_teardown(struct bio_blobstore *bbs) static void setup_xs_bs(void *arg) { + struct bio_io_context *ioc; struct bio_xs_blobstore *bxb = arg; struct bio_blobstore *bbs; + int closed_blobs = 0; D_ASSERT(bxb != NULL); if (!is_server_started()) { @@ -223,9 +243,35 @@ setup_xs_bs(void *arg) D_ERROR("Failed to create io channel for %p\n", bbs); return; } - /* Blobs (VOS pools) will be opened in reint reaction */ + } + + /* If reint will be tirggered later, blobs will be opened in reint reaction */ + if (bbs->bb_dev->bb_trigger_reint) { D_ASSERT(d_list_empty(&bxb->bxb_io_ctxts)); + goto done; + } + + /* Open all blobs when reint won't be tirggered */ + d_list_for_each_entry(ioc, &bxb->bxb_io_ctxts, bic_link) { + if (ioc->bic_blob != NULL && !ioc->bic_closing) + continue; + + closed_blobs += 1; + if (ioc->bic_opening || ioc->bic_closing) + continue; + + D_ASSERT(ioc->bic_blob_id != SPDK_BLOBID_INVALID); + /* device type and flags will be ignored in bio_blob_open() */ + bio_blob_open(ioc, true, 0, SMD_DEV_TYPE_MAX, ioc->bic_blob_id); + } + + if (closed_blobs) { + D_DEBUG(DB_MGMT, "blobstore:%p has %d closed blobs\n", + bbs, closed_blobs); + return; } +done: + bxb->bxb_ready = true; } static void @@ -325,7 +371,7 @@ on_setup(struct bio_blobstore *bbs) D_ASSERT(bxb != NULL); /* Setup for the per-xsteam blobstore is done */ - if (bxb->bxb_io_channel != NULL) + if (bxb->bxb_ready) continue; D_ASSERT(xs_ctxt->bxc_thread != NULL); diff --git a/src/bio/bio_xstream.c b/src/bio/bio_xstream.c index 1b57241a8447..15a6b2270ca0 100644 --- a/src/bio/bio_xstream.c +++ b/src/bio/bio_xstream.c @@ -89,15 +89,66 @@ struct bio_nvme_data { }; static struct bio_nvme_data nvme_glb; +struct bio_faulty_criteria glb_criteria; static int -bio_spdk_env_init(void) +bio_spdk_conf_read(struct spdk_env_opts *opts) { - struct spdk_env_opts opts; bool enable_rpc_srv = false; bool vmd_enabled = false; - int rc; int roles = 0; + int rc; + + rc = bio_add_allowed_alloc(nvme_glb.bd_nvme_conf, opts, &roles, &vmd_enabled); + if (rc != 0) { + DL_ERROR(rc, "Failed to add allowed devices to SPDK env"); + return rc; + } + nvme_glb.bd_nvme_roles = roles; + bio_vmd_enabled = vmd_enabled; + + rc = bio_set_hotplug_filter(nvme_glb.bd_nvme_conf); + if (rc != 0) { + DL_ERROR(rc, "Failed to set hotplug filter"); + return rc; + } + + rc = bio_read_accel_props(nvme_glb.bd_nvme_conf); + if (rc != 0) { + DL_ERROR(rc, "Failed to read acceleration properties"); + return rc; + } + + rc = bio_read_rpc_srv_settings(nvme_glb.bd_nvme_conf, &enable_rpc_srv, + &nvme_glb.bd_rpc_srv_addr); + if (rc != 0) { + DL_ERROR(rc, "Failed to read SPDK JSON-RPC server settings"); + return rc; + } +#ifdef DAOS_BUILD_RELEASE + if (enable_rpc_srv) { + D_ERROR("SPDK JSON-RPC server may not be enabled for release builds.\n"); + return -DER_INVAL; + } +#endif + nvme_glb.bd_enable_rpc_srv = enable_rpc_srv; + + rc = bio_read_auto_faulty_criteria(nvme_glb.bd_nvme_conf, &glb_criteria.fc_enabled, + &glb_criteria.fc_max_io_errs, + &glb_criteria.fc_max_csum_errs); + if (rc != 0) { + DL_ERROR(rc, "Failed to read NVMe auto-faulty criteria"); + return rc; + } + + return 0; +} + +static int +bio_spdk_env_init(void) +{ + struct spdk_env_opts opts; + int rc; /* Only print error and more severe to stderr. 
*/ spdk_log_set_print_level(SPDK_LOG_ERROR); @@ -114,45 +165,11 @@ bio_spdk_env_init(void) */ if (bio_nvme_configured(SMD_DEV_TYPE_MAX)) { - rc = bio_add_allowed_alloc(nvme_glb.bd_nvme_conf, &opts, &roles, &vmd_enabled); + rc = bio_spdk_conf_read(&opts); if (rc != 0) { - D_ERROR("Failed to add allowed devices to SPDK env, "DF_RC"\n", - DP_RC(rc)); + DL_ERROR(rc, "Failed to process nvme config"); goto out; } - nvme_glb.bd_nvme_roles = roles; - bio_vmd_enabled = vmd_enabled; - - rc = bio_set_hotplug_filter(nvme_glb.bd_nvme_conf); - if (rc != 0) { - D_ERROR("Failed to set hotplug filter, "DF_RC"\n", DP_RC(rc)); - goto out; - } - - rc = bio_read_accel_props(nvme_glb.bd_nvme_conf); - if (rc != 0) { - D_ERROR("Failed to read acceleration properties, "DF_RC"\n", DP_RC(rc)); - goto out; - } - - /** - * Read flag to indicate whether to enable the SPDK JSON-RPC server and the - * socket file address from the JSON config used to initialize SPDK subsystems. - */ - rc = bio_read_rpc_srv_settings(nvme_glb.bd_nvme_conf, &enable_rpc_srv, - &nvme_glb.bd_rpc_srv_addr); - if (rc != 0) { - D_ERROR("Failed to read SPDK JSON-RPC server settings, "DF_RC"\n", - DP_RC(rc)); - goto out; - } -#ifdef DAOS_BUILD_RELEASE - if (enable_rpc_srv) { - D_ERROR("SPDK JSON-RPC server may not be enabled for release builds.\n"); - D_GOTO(out, rc = -DER_INVAL); - } -#endif - nvme_glb.bd_enable_rpc_srv = enable_rpc_srv; } rc = spdk_env_init(&opts); @@ -181,29 +198,6 @@ bypass_health_collect() return nvme_glb.bd_bypass_health_collect; } -struct bio_faulty_criteria glb_criteria; - -/* TODO: Make it configurable through control plane */ -static inline void -set_faulty_criteria(void) -{ - glb_criteria.fc_enabled = true; - glb_criteria.fc_max_io_errs = 10; - /* - * FIXME: Don't enable csum error criterion for now, otherwise, targets - * be unexpectedly down in CSUM tests. - */ - glb_criteria.fc_max_csum_errs = UINT32_MAX; - - d_getenv_bool("DAOS_NVME_AUTO_FAULTY_ENABLED", &glb_criteria.fc_enabled); - d_getenv_uint32_t("DAOS_NVME_AUTO_FAULTY_IO", &glb_criteria.fc_max_io_errs); - d_getenv_uint32_t("DAOS_NVME_AUTO_FAULTY_CSUM", &glb_criteria.fc_max_csum_errs); - - D_INFO("NVMe auto faulty is %s. Criteria: max_io_errs:%u, max_csum_errs:%u\n", - glb_criteria.fc_enabled ? "enabled" : "disabled", - glb_criteria.fc_max_io_errs, glb_criteria.fc_max_csum_errs); -} - int bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size, unsigned int hugepage_size, unsigned int tgt_nr, bool bypass_health_collect) @@ -242,6 +236,14 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size, goto free_mutex; } + glb_criteria.fc_enabled = true; + glb_criteria.fc_max_io_errs = 10; + /* + * FIXME: Don't enable csum error criterion by default otherwise targets will be + * unexpectedly down in CSUM tests. 
+ */ + glb_criteria.fc_max_csum_errs = UINT32_MAX; + bio_chk_cnt_init = DAOS_DMA_CHUNK_CNT_INIT; bio_chk_cnt_max = DAOS_DMA_CHUNK_CNT_MAX; bio_chk_sz = ((uint64_t)size_mb << 20) >> BIO_DMA_PAGE_SHIFT; @@ -291,11 +293,12 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size, nvme_glb.bd_bs_opts.cluster_sz = DAOS_BS_CLUSTER_SZ; nvme_glb.bd_bs_opts.max_channel_ops = BIO_BS_MAX_CHANNEL_OPS; - env = getenv("VOS_BDEV_CLASS"); + d_agetenv_str(&env, "VOS_BDEV_CLASS"); if (env && strcasecmp(env, "AIO") == 0) { D_WARN("AIO device(s) will be used!\n"); nvme_glb.bd_bdev_class = BDEV_CLASS_AIO; } + d_freeenv_str(&env); if (numa_node > 0) { bio_numa_node = (unsigned int)numa_node; @@ -332,7 +335,6 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size, bio_nvme_configured(SMD_DEV_TYPE_META) ? "enabled" : "disabled"); bio_spdk_inited = true; - set_faulty_criteria(); return 0; diff --git a/src/bio/smd.pb-c.c b/src/bio/smd.pb-c.c index b3ed3284385c..de49e886e192 100644 --- a/src/bio/smd.pb-c.c +++ b/src/bio/smd.pb-c.c @@ -2833,19 +2833,19 @@ const ProtobufCEnumDescriptor ctl__nvme_dev_state__descriptor = }; static const ProtobufCEnumValue ctl__led_state__enum_values_by_number[5] = { - { "OFF", "CTL__LED_STATE__OFF", 0 }, + { "NA", "CTL__LED_STATE__NA", 0 }, { "QUICK_BLINK", "CTL__LED_STATE__QUICK_BLINK", 1 }, { "ON", "CTL__LED_STATE__ON", 2 }, { "SLOW_BLINK", "CTL__LED_STATE__SLOW_BLINK", 3 }, - { "NA", "CTL__LED_STATE__NA", 4 }, + { "OFF", "CTL__LED_STATE__OFF", 4 }, }; static const ProtobufCIntRange ctl__led_state__value_ranges[] = { {0, 0},{0, 5} }; static const ProtobufCEnumValueIndex ctl__led_state__enum_values_by_name[5] = { - { "NA", 4 }, - { "OFF", 0 }, + { "NA", 0 }, + { "OFF", 4 }, { "ON", 2 }, { "QUICK_BLINK", 1 }, { "SLOW_BLINK", 3 }, diff --git a/src/bio/smd.pb-c.h b/src/bio/smd.pb-c.h index 19ac9fc3d149..fd4ca542b604 100644 --- a/src/bio/smd.pb-c.h +++ b/src/bio/smd.pb-c.h @@ -66,9 +66,9 @@ typedef enum _Ctl__NvmeDevState { } Ctl__NvmeDevState; typedef enum _Ctl__LedState { /* - * Equivalent to SPDK_VMD_LED_STATE_OFF + * Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) */ - CTL__LED_STATE__OFF = 0, + CTL__LED_STATE__NA = 0, /* * Equivalent to SPDK_VMD_LED_STATE_IDENTIFY (4Hz blink) */ @@ -82,9 +82,9 @@ typedef enum _Ctl__LedState { */ CTL__LED_STATE__SLOW_BLINK = 3, /* - * Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) + * Equivalent to SPDK_VMD_LED_STATE_OFF */ - CTL__LED_STATE__NA = 4 + CTL__LED_STATE__OFF = 4 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(CTL__LED_STATE) } Ctl__LedState; typedef enum _Ctl__LedAction { @@ -305,7 +305,7 @@ struct _Ctl__NvmeController }; #define CTL__NVME_CONTROLLER__INIT \ { PROTOBUF_C_MESSAGE_INIT (&ctl__nvme_controller__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0,NULL, 0,NULL, CTL__NVME_DEV_STATE__UNKNOWN, CTL__LED_STATE__OFF, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string } + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0,NULL, 0,NULL, CTL__NVME_DEV_STATE__UNKNOWN, CTL__LED_STATE__NA, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string } /* @@ -558,7 +558,7 @@ struct _Ctl__LedManageReq }; #define CTL__LED_MANAGE_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&ctl__led_manage_req__descriptor) \ - , (char *)protobuf_c_empty_string, CTL__LED_ACTION__GET, 
CTL__LED_STATE__OFF, 0 } + , (char *)protobuf_c_empty_string, CTL__LED_ACTION__GET, CTL__LED_STATE__NA, 0 } struct _Ctl__DevReplaceReq diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 6c3805a78c92..9aca5a578a3f 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -793,8 +793,7 @@ crt_hg_free_protocol_info(struct na_protocol_info *na_protocol_info) int crt_hg_init(void) { - int rc = 0; - char *env; + int rc = 0; if (crt_initialized()) { D_ERROR("CaRT already initialized.\n"); @@ -803,10 +802,8 @@ crt_hg_init(void) #define EXT_FAC DD_FAC(external) - env = getenv("HG_LOG_SUBSYS"); - if (!env) { - env = getenv("HG_LOG_LEVEL"); - if (!env) + if (!d_isenv_def("HG_LOG_SUBSYS")) { + if (!d_isenv_def("HG_LOG_LEVEL")) HG_Set_log_level("warning"); HG_Set_log_subsys("hg,na"); } diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 8359afa49037..aa5716e71b52 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,44 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +/* List of the environment variables used in CaRT */ +static const char *crt_env_names[] = {"D_PROVIDER", + "D_INTERFACE", + "D_DOMAIN", + "D_PORT", + "CRT_PHY_ADDR_STR", + "D_LOG_STDERR_IN_LOG", + "D_LOG_SIZE", + "D_LOG_FILE", + "D_LOG_FILE_APPEND_PID", + "D_LOG_MASK", + "DD_MASK", + "DD_STDERR", + "DD_SUBSYS", + "CRT_TIMEOUT", + "CRT_ATTACH_INFO_PATH", + "OFI_PORT", + "OFI_INTERFACE", + "OFI_DOMAIN", + "CRT_CREDIT_EP_CTX", + "CRT_CTX_SHARE_ADDR", + "CRT_CTX_NUM", + "D_FI_CONFIG", + "FI_UNIVERSE_SIZE", + "CRT_ENABLE_MEM_PIN", + "FI_OFI_RXM_USE_SRX", + "D_LOG_FLUSH", + "CRT_MRC_ENABLE", + "CRT_SECONDARY_PROVIDER", + "D_PROVIDER_AUTH_KEY", + "D_PORT_AUTO_ADJUST", + "D_POLL_TIMEOUT", + "D_LOG_FILE_APPEND_RANK", + "D_QUOTA_RPCS", + "D_POST_INIT", + "D_POST_INCR", + "DAOS_SIGNAL_REGISTER"}; + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -62,52 +100,20 @@ crt_lib_fini(void) static void dump_envariables(void) { - int i; - char *val; - char *envars[] = {"D_PROVIDER", - "D_INTERFACE", - "D_DOMAIN", - "D_PORT", - "CRT_PHY_ADDR_STR", - "D_LOG_STDERR_IN_LOG", - "D_LOG_SIZE", - "D_LOG_FILE", - "D_LOG_FILE_APPEND_PID", - "D_LOG_MASK", - "DD_MASK", - "DD_STDERR", - "DD_SUBSYS", - "CRT_TIMEOUT", - "CRT_ATTACH_INFO_PATH", - "OFI_PORT", - "OFI_INTERFACE", - "OFI_DOMAIN", - "CRT_CREDIT_EP_CTX", - "CRT_CTX_SHARE_ADDR", - "CRT_CTX_NUM", - "D_FI_CONFIG", - "FI_UNIVERSE_SIZE", - "CRT_ENABLE_MEM_PIN", - "FI_OFI_RXM_USE_SRX", - "D_LOG_FLUSH", - "CRT_MRC_ENABLE", - "CRT_SECONDARY_PROVIDER", - "D_PROVIDER_AUTH_KEY", - "D_PORT_AUTO_ADJUST", - "D_POLL_TIMEOUT", - "D_LOG_FILE_APPEND_RANK", - "D_QUOTA_RPCS", - "D_POST_INIT", - "D_POST_INCR", - "DAOS_SIGNAL_REGISTER"}; + int i; D_INFO("-- ENVARS: --\n"); - for (i = 0; i < ARRAY_SIZE(envars); i++) { - val = getenv(envars[i]); - if (strcmp(envars[i], "D_PROVIDER_AUTH_KEY") == 0 && val) - D_INFO("%s = %s\n", envars[i], "********"); + for (i = 0; i < ARRAY_SIZE(crt_env_names); i++) { + char *val = NULL; + + d_agetenv_str(&val, crt_env_names[i]); + if (val == NULL) + continue; + if (strcmp(crt_env_names[i], "D_PROVIDER_AUTH_KEY") == 0) + D_INFO("%s = %s\n", crt_env_names[i], "********"); else - D_INFO("%s = %s\n", envars[i], val); + D_INFO("%s = %s\n", crt_env_names[i], val); + d_freeenv_str(&val); } } @@ -596,41 +602,40 @@ crt_protocol_info_free(struct crt_protocol_info *protocol_info) int crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) { - char 
*provider_env; - char *interface_env; - char *domain_env; - char *auth_key_env; - char *tmp; - struct timeval now; - unsigned int seed; - const char *path; - bool server; - int rc = 0; - char *provider_str0 = NULL; - char *provider_str1 = NULL; - crt_provider_t primary_provider; - crt_provider_t secondary_provider; - crt_provider_t tmp_prov; - char *port_str, *port0, *port1; - char *iface0, *iface1, *domain0, *domain1; - char *auth_key0, *auth_key1; - int num_secondaries = 0; - bool port_auto_adjust = false; - int i; + char *provider; + char *provider_env = NULL; + char *interface; + char *interface_env = NULL; + char *domain; + char *domain_env = NULL; + char *auth_key; + char *auth_key_env = NULL; + struct timeval now; + unsigned int seed; + char *path; + bool server = flags & CRT_FLAG_BIT_SERVER; + int rc = 0; + char *provider_str0 = NULL; + char *provider_str1 = NULL; + crt_provider_t primary_provider; + crt_provider_t secondary_provider; + crt_provider_t tmp_prov; + char *port; + char *port_env = NULL; + char *port0 = NULL; + char *port1 = NULL; + char *iface0 = NULL; + char *iface1 = NULL; + char *domain0 = NULL; + char *domain1 = NULL; + char *auth_key0 = NULL; + char *auth_key1 = NULL; + int num_secondaries = 0; + bool port_auto_adjust = false; + int i; d_signal_register(); - server = flags & CRT_FLAG_BIT_SERVER; - port_str = NULL; - port0 = NULL; - port1 = NULL; - iface0 = NULL; - iface1 = NULL; - domain0 = NULL; - domain1 = NULL; - auth_key0 = NULL; - auth_key1 = NULL; - /* d_log_init is reference counted */ rc = d_log_init(); if (rc != 0) { @@ -677,7 +682,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) crt_gdata.cg_auto_swim_disable = (flags & CRT_FLAG_BIT_AUTO_SWIM_DISABLE) ? 1 : 0; - path = getenv("CRT_ATTACH_INFO_PATH"); + d_agetenv_str(&path, "CRT_ATTACH_INFO_PATH"); if (path != NULL && strlen(path) > 0) { rc = crt_group_config_path_set(path); if (rc != 0) @@ -687,55 +692,55 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) else D_DEBUG(DB_ALL, "set group_config_path as %s.\n", path); } + d_freeenv_str(&path); if (opt && opt->cio_auth_key) - auth_key_env = opt->cio_auth_key; - else - auth_key_env = getenv("D_PROVIDER_AUTH_KEY"); + auth_key = opt->cio_auth_key; + else { + d_agetenv_str(&auth_key_env, "D_PROVIDER_AUTH_KEY"); + auth_key = auth_key_env; + } if (opt && opt->cio_provider) - provider_env = opt->cio_provider; + provider = opt->cio_provider; else { - provider_env = getenv(CRT_PHY_ADDR_ENV); - - tmp = getenv("D_PROVIDER"); - if (tmp) - provider_env = tmp; + d_agetenv_str(&provider_env, "D_PROVIDER"); + if (provider_env == NULL) + d_agetenv_str(&provider_env, CRT_PHY_ADDR_ENV); + provider = provider_env; } if (opt && opt->cio_interface) - interface_env = opt->cio_interface; + interface = opt->cio_interface; else { - interface_env = getenv("OFI_INTERFACE"); - - tmp = getenv("D_INTERFACE"); - if (tmp) - interface_env = tmp; + d_agetenv_str(&interface_env, "D_INTERFACE"); + if (interface_env == NULL) { + d_agetenv_str(&interface_env, "OFI_INTERFACE"); + } + interface = interface_env; } if (opt && opt->cio_domain) - domain_env = opt->cio_domain; + domain = opt->cio_domain; else { - domain_env = getenv("OFI_DOMAIN"); - - tmp = getenv("D_DOMAIN"); - if (tmp) - domain_env = tmp; + d_agetenv_str(&domain_env, "D_DOMAIN"); + if (domain_env == NULL) + d_agetenv_str(&domain_env, "OFI_DOMAIN"); + domain = domain_env; } if (opt && opt->cio_port) - port_str = opt->cio_port; + port = opt->cio_port; else { - port_str = 
getenv("OFI_PORT"); - - tmp = getenv("D_PORT"); - if (tmp) - port_str = tmp; + d_agetenv_str(&port_env, "D_PORT"); + if (port_env == NULL) + d_agetenv_str(&port_env, "OFI_PORT"); + port = port_env; } d_getenv_bool("D_PORT_AUTO_ADJUST", &port_auto_adjust); - rc = __split_arg(provider_env, &provider_str0, &provider_str1); + rc = __split_arg(provider, &provider_str0, &provider_str1); if (rc != 0) D_GOTO(unlock, rc); @@ -743,20 +748,20 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) secondary_provider = crt_str_to_provider(provider_str1); if (primary_provider == CRT_PROV_UNKNOWN) { - D_ERROR("Requested provider %s not found\n", provider_env); + D_ERROR("Requested provider %s not found\n", provider); D_GOTO(unlock, rc = -DER_NONEXIST); } - rc = __split_arg(interface_env, &iface0, &iface1); + rc = __split_arg(interface, &iface0, &iface1); if (rc != 0) D_GOTO(unlock, rc); - rc = __split_arg(domain_env, &domain0, &domain1); + rc = __split_arg(domain, &domain0, &domain1); if (rc != 0) D_GOTO(unlock, rc); - rc = __split_arg(port_str, &port0, &port1); + rc = __split_arg(port, &port0, &port1); if (rc != 0) D_GOTO(unlock, rc); - rc = __split_arg(auth_key_env, &auth_key0, &auth_key1); + rc = __split_arg(auth_key, &auth_key0, &auth_key1); if (rc != 0) D_GOTO(unlock, rc); @@ -896,6 +901,11 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_FREE(domain0); D_FREE(provider_str0); D_FREE(auth_key0); + d_freeenv_str(&port_env); + d_freeenv_str(&domain_env); + d_freeenv_str(&interface_env); + d_freeenv_str(&provider_env); + d_freeenv_str(&auth_key_env); if (rc != 0) { D_ERROR("failed, "DF_RC"\n", DP_RC(rc)); diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index af3226facd82..59ad504993ff 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -3508,8 +3508,8 @@ crt_iv_update_internal(crt_iv_namespace_t ivns, uint32_t class_id, D_GOTO(exit, rc); } else { - DL_CDEBUG(rc == -DER_NONEXIST || rc == -DER_NOTLEADER, DLOG_INFO, DLOG_ERR, rc, - "ivo_on_update failed"); + DL_CDEBUG(rc == -DER_NONEXIST || rc == -DER_NOTLEADER || rc == -DER_BUSY, + DLOG_INFO, DLOG_ERR, rc, "ivo_on_update failed"); update_comp_cb(ivns, class_id, iv_key, NULL, iv_value, rc, cb_arg); diff --git a/src/cart/utils/crt_utils.c b/src/cart/utils/crt_utils.c index 5e1a7582c901..6f7cee03c384 100644 --- a/src/cart/utils/crt_utils.c +++ b/src/cart/utils/crt_utils.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -101,11 +101,12 @@ write_completion_file(void) char *dir; char *completion_file = NULL; - dir = getenv("DAOS_TEST_SHARED_DIR"); + d_agetenv_str(&dir, "DAOS_TEST_SHARED_DIR"); D_ASSERTF(dir != NULL, "DAOS_TEST_SHARED_DIR must be set for --write_completion_file " "option.\n"); D_ASPRINTF(completion_file, "%s/test-servers-completed.txt.%d", dir, getpid()); + d_freeenv_str(&dir); D_ASSERTF(completion_file != NULL, "Error allocating completion_file string\n"); unlink(completion_file); @@ -412,12 +413,15 @@ crtu_dc_mgmt_net_cfg_rank_add(const char *name, crt_group_t *group, int crtu_dc_mgmt_net_cfg_setenv(const char *name) { - int rc; - char buf[SYS_INFO_BUF_SIZE]; - char *crt_timeout; - char *ofi_interface; - char *ofi_domain; - char *cli_srx_set; + int rc; + char *crt_phy_addr_str; + char *crt_ctx_share_addr = NULL; + char *cli_srx_set = NULL; + char *crt_timeout = NULL; + char *ofi_interface; + char *ofi_interface_env = NULL; + char *ofi_domain; + char *ofi_domain_env = NULL; struct dc_mgmt_sys_info crt_net_cfg_info = {0}; Mgmt__GetAttachInfoResp *crt_net_cfg_resp = NULL; @@ -433,29 +437,38 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) } /* These two are always set */ - D_INFO("setenv CRT_PHY_ADDR_STR=%s\n", crt_net_cfg_info.provider); - rc = d_setenv("CRT_PHY_ADDR_STR", crt_net_cfg_info.provider, 1); + crt_phy_addr_str = crt_net_cfg_info.provider; + D_INFO("setenv CRT_PHY_ADDR_STR=%s\n", crt_phy_addr_str); + rc = d_setenv("CRT_PHY_ADDR_STR", crt_phy_addr_str, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); - sprintf(buf, "%d", crt_net_cfg_info.crt_ctx_share_addr); - D_INFO("setenv CRT_CTX_SHARE_ADDR=%d\n", crt_net_cfg_info.crt_ctx_share_addr); - rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1); + rc = asprintf(&crt_ctx_share_addr, "%d", crt_net_cfg_info.crt_ctx_share_addr); + if (rc < 0) { + crt_ctx_share_addr = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + D_INFO("setenv CRT_CTX_SHARE_ADDR=%s\n", crt_ctx_share_addr); + rc = d_setenv("CRT_CTX_SHARE_ADDR", crt_ctx_share_addr, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); /* If the server has set this, the client must use the same value. */ if (crt_net_cfg_info.srv_srx_set != -1) { - sprintf(buf, "%d", crt_net_cfg_info.srv_srx_set); - rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1); - D_INFO("setenv FI_OFI_RXM_USE_SRX=%d\n", crt_net_cfg_info.srv_srx_set); + rc = asprintf(&cli_srx_set, "%d", crt_net_cfg_info.srv_srx_set); + if (rc < 0) { + cli_srx_set = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + D_INFO("setenv FI_OFI_RXM_USE_SRX=%s\n", cli_srx_set); + rc = d_setenv("FI_OFI_RXM_USE_SRX", cli_srx_set, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); - D_DEBUG(DB_MGMT, "Using server's value for FI_OFI_RXM_USE_SRX: %s\n", buf); + D_DEBUG(DB_MGMT, "Using server's value for FI_OFI_RXM_USE_SRX: %s\n", cli_srx_set); } else { /* Client may not set it if the server hasn't. 
*/ - cli_srx_set = getenv("FI_OFI_RXM_USE_SRX"); + d_agetenv_str(&cli_srx_set, "FI_OFI_RXM_USE_SRX"); if (cli_srx_set) { D_ERROR("Client set FI_OFI_RXM_USE_SRX to %s, " "but server is unset!\n", cli_srx_set); @@ -464,47 +477,58 @@ crtu_dc_mgmt_net_cfg_setenv(const char *name) } /* Allow client env overrides for these three */ - crt_timeout = getenv("CRT_TIMEOUT"); + d_agetenv_str(&crt_timeout, "CRT_TIMEOUT"); if (!crt_timeout) { - sprintf(buf, "%d", crt_net_cfg_info.crt_timeout); - rc = d_setenv("CRT_TIMEOUT", buf, 1); - D_INFO("setenv CRT_TIMEOUT=%d\n", crt_net_cfg_info.crt_timeout); + rc = asprintf(&crt_timeout, "%d", crt_net_cfg_info.crt_timeout); + if (rc < 0) { + crt_timeout = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + D_INFO("setenv CRT_TIMEOUT=%s\n", crt_timeout); + rc = d_setenv("CRT_TIMEOUT", crt_timeout, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { D_DEBUG(DB_MGMT, "Using client provided CRT_TIMEOUT: %s\n", crt_timeout); } - ofi_interface = getenv("OFI_INTERFACE"); - if (!ofi_interface) { - rc = d_setenv("OFI_INTERFACE", crt_net_cfg_info.interface, 1); - D_INFO("Setting OFI_INTERFACE=%s\n", crt_net_cfg_info.interface); + d_agetenv_str(&ofi_interface_env, "OFI_INTERFACE"); + if (!ofi_interface_env) { + ofi_interface = crt_net_cfg_info.interface; + D_INFO("Setting OFI_INTERFACE=%s\n", ofi_interface); + rc = d_setenv("OFI_INTERFACE", ofi_interface, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { + ofi_interface = ofi_interface_env; D_DEBUG(DB_MGMT, "Using client provided OFI_INTERFACE: %s\n", ofi_interface); } - ofi_domain = getenv("OFI_DOMAIN"); - if (!ofi_domain) { - rc = d_setenv("OFI_DOMAIN", crt_net_cfg_info.domain, 1); - D_INFO("Setting OFI_DOMAIN=%s\n", crt_net_cfg_info.domain); + d_agetenv_str(&ofi_domain_env, "OFI_DOMAIN"); + if (!ofi_domain_env) { + ofi_domain = crt_net_cfg_info.domain; + D_INFO("Setting OFI_DOMAIN=%s\n", ofi_domain); + rc = d_setenv("OFI_DOMAIN", ofi_domain, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { + ofi_domain = ofi_domain_env; D_DEBUG(DB_MGMT, "Using client provided OFI_DOMAIN: %s\n", ofi_domain); } D_INFO("CaRT env setup with:\n" - "\tOFI_INTERFACE=%s, OFI_DOMAIN: %s, CRT_PHY_ADDR_STR: %s, " - "CRT_CTX_SHARE_ADDR: %s, CRT_TIMEOUT: %s\n", - getenv("OFI_INTERFACE"), getenv("OFI_DOMAIN"), - getenv("CRT_PHY_ADDR_STR"), - getenv("CRT_CTX_SHARE_ADDR"), getenv("CRT_TIMEOUT")); + "\tOFI_INTERFACE=%s, OFI_DOMAIN: %s, CRT_PHY_ADDR_STR: %s, " + "CRT_CTX_SHARE_ADDR: %s, CRT_TIMEOUT: %s\n", + ofi_interface, ofi_domain, crt_phy_addr_str, crt_ctx_share_addr, crt_timeout); cleanup: + d_freeenv_str(&ofi_domain_env); + d_freeenv_str(&ofi_interface_env); + d_freeenv_str(&crt_timeout); + d_freeenv_str(&cli_srx_set); + d_freeenv_str(&crt_ctx_share_addr); dc_put_attach_info(&crt_net_cfg_info, crt_net_cfg_resp); return rc; @@ -575,7 +599,7 @@ crtu_cli_start_basic(char *local_group_name, char *srv_group_name, if (*grp == NULL) D_GOTO(out, rc = -DER_INVAL); - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); /* load group info from a config file and * delete file upon return @@ -583,6 +607,7 @@ crtu_cli_start_basic(char *local_group_name, char *srv_group_name, rc = crtu_load_group_from_file(grp_cfg_file, *crt_ctx, *grp, -1, true); + d_freeenv_str(&grp_cfg_file); if (rc != 0) D_GOTO(out, rc); } @@ -644,7 +669,6 @@ crtu_srv_start_basic(char *srv_group_name, crt_context_t *crt_ctx, pthread_t *progress_thread, crt_group_t **grp, uint32_t *grp_size, crt_init_options_t 
*init_opt)
 {
-	char		*env_self_rank;
 	char		*grp_cfg_file;
 	char		*my_uri;
 	d_rank_t	 my_rank;
@@ -653,8 +677,8 @@ crtu_srv_start_basic(char *srv_group_name, crt_context_t *crt_ctx,
 	if (opts.assert_on_error)
 		D_ASSERTF(opts.is_initialized == true, "crtu_test_init not called.\n");
 
-	env_self_rank = getenv("CRT_L_RANK");
-	my_rank = atoi(env_self_rank);
+	rc = d_getenv_uint32_t("CRT_L_RANK", &my_rank);
+	D_ASSERTF(rc == DER_SUCCESS, "Rank cannot be retrieved: " DF_RC "\n", DP_RC(rc));
 
 	rc = d_log_init();
 	if (rc != 0)
@@ -695,19 +719,19 @@ crtu_srv_start_basic(char *srv_group_name, crt_context_t *crt_ctx,
 		D_GOTO(out, rc);
 	}
 
-	grp_cfg_file = getenv("CRT_L_GRP_CFG");
-
 	rc = crt_rank_uri_get(*grp, my_rank, 0, &my_uri);
 	if (rc != 0)
 		D_GOTO(out, rc);
+	D_FREE(my_uri);
+
+	rc = d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG");
 
 	/* load group info from a config file and delete file upon return */
 	rc = crtu_load_group_from_file(grp_cfg_file, crt_ctx[0], *grp, my_rank, true);
+	d_freeenv_str(&grp_cfg_file);
 	if (rc != 0)
 		D_GOTO(out, rc);
 
-	D_FREE(my_uri);
-
 	rc = crt_group_size(NULL, grp_size);
 	if (rc != 0)
 		D_GOTO(out, rc);
diff --git a/src/client/api/agent.c b/src/client/api/agent.c
index 365ba908ef62..ede098eb1655 100644
--- a/src/client/api/agent.c
+++ b/src/client/api/agent.c
@@ -12,14 +12,16 @@ char *dc_agent_sockpath;
 int
 dc_agent_init()
 {
-	char *path = NULL;
-	char *envpath = getenv(DAOS_AGENT_DRPC_DIR_ENV);
+	char *path = NULL;
+	char *envpath;
 
-	if (envpath)
+	d_agetenv_str(&envpath, DAOS_AGENT_DRPC_DIR_ENV);
+	if (envpath != NULL)
 		D_ASPRINTF(path, "%s/%s", envpath, DAOS_AGENT_DRPC_SOCK_NAME);
 	else
 		D_STRNDUP_S(path, DEFAULT_DAOS_AGENT_DRPC_SOCK);
+	d_freeenv_str(&envpath);
 
 	if (path == NULL)
 		return -DER_NOMEM;
diff --git a/src/client/api/job.c b/src/client/api/job.c
index e4a330f455c6..c184c7708401 100644
--- a/src/client/api/job.c
+++ b/src/client/api/job.c
@@ -37,22 +37,24 @@ int
 dc_job_init(void)
 {
 	char *jobid;
-	char *jobid_env = getenv(JOBID_ENV);
+	char *jobid_env;
 	int   err = 0;
 
+	d_agetenv_str(&jobid_env, JOBID_ENV);
 	if (jobid_env == NULL) {
 		D_STRNDUP_S(jobid_env, DEFAULT_JOBID_ENV);
 	} else {
 		char *tmp_env = jobid_env;
 
 		D_STRNDUP(jobid_env, tmp_env, MAX_ENV_NAME);
+		d_freeenv_str(&tmp_env);
 	}
 	if (jobid_env == NULL)
 		D_GOTO(out_err, err = -DER_NOMEM);
 
 	dc_jobid_env = jobid_env;
 
-	jobid = getenv(dc_jobid_env);
+	d_agetenv_str(&jobid, dc_jobid_env);
 	if (jobid == NULL) {
 		err = craft_default_jobid(&jobid);
 		if (err)
@@ -61,6 +63,7 @@ dc_job_init(void)
 		char *tmp_jobid = jobid;
 
 		D_STRNDUP(jobid, tmp_jobid, MAX_JOBID_LEN);
+		d_freeenv_str(&tmp_jobid);
 		if (jobid == NULL)
 			D_GOTO(out_env, err = -DER_NOMEM);
 	}
diff --git a/src/client/dfs/dfs.c b/src/client/dfs/dfs.c
index a95acd9d31bf..1956455bb2eb 100644
--- a/src/client/dfs/dfs.c
+++ b/src/client/dfs/dfs.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2018-2023 Intel Corporation.
+ * (C) Copyright 2018-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -568,7 +568,7 @@ fetch_entry(dfs_layout_ver_t ver, daos_handle_t oh, daos_handle_t th, const char
 	/** TODO - not supported yet */
 	if (strcmp(name, ".") == 0)
-		D_ASSERT(0);
+		return ENOTSUP;
 
 	if (xnr) {
 		D_ALLOC_ARRAY(pxnames, xnr);
@@ -3416,13 +3416,10 @@ lookup_rel_path(dfs_t *dfs, dfs_obj_t *root, const char *path, int flags,
 lookup_rel_path_loop:
 	/*
-	 * Open the directory object one level up.
-	 * Since fetch_entry does not support ".",
-	 * we can't support ".." as the last entry,
-	 * nor can we support "../.." because we don't
-	 * have parent.parent_oid and parent.mode.
- * For now, represent this partial state with - * parent_fully_valid. + * Open the directory object one level up. Since fetch_entry does not support ".", + * we can't support ".." as the last entry, nor can we support "../.." because we + * don't have parent.parent_oid and parent.mode. For now, represent this partial + * state with parent_fully_valid. */ parent_fully_valid = true; if (strcmp(token, "..") == 0) { @@ -3508,15 +3505,23 @@ lookup_rel_path(dfs_t *dfs, dfs_obj_t *root, const char *path, int flags, } if (stbuf) { - daos_size_t size; + daos_array_stbuf_t array_stbuf = {0}; - rc = daos_array_get_size(obj->oh, DAOS_TX_NONE, &size, NULL); + rc = daos_array_stat(obj->oh, DAOS_TX_NONE, &array_stbuf, NULL); if (rc) { daos_array_close(obj->oh, NULL); D_GOTO(err_obj, rc = daos_der2errno(rc)); } - stbuf->st_size = size; + + stbuf->st_size = array_stbuf.st_size; stbuf->st_blocks = (stbuf->st_size + (1 << 9) - 1) >> 9; + + rc = update_stbuf_times(entry, array_stbuf.st_max_epoch, stbuf, + NULL); + if (rc) { + daos_array_close(obj->oh, NULL); + D_GOTO(err_obj, rc); + } } break; } @@ -3617,14 +3622,28 @@ lookup_rel_path(dfs_t *dfs, dfs_obj_t *root, const char *path, int flags, } obj->d.chunk_size = entry.chunk_size; - obj->d.oclass = entry.oclass; - if (stbuf) - stbuf->st_size = sizeof(entry); - + obj->d.oclass = entry.oclass; oid_cp(&parent.oid, obj->oid); oid_cp(&parent.parent_oid, obj->parent_oid); parent.oh = obj->oh; parent.mode = entry.mode; + + if (stbuf) { + daos_epoch_t ep; + + rc = daos_obj_query_max_epoch(obj->oh, DAOS_TX_NONE, &ep, NULL); + if (rc) { + daos_obj_close(obj->oh, NULL); + D_GOTO(err_obj, rc = daos_der2errno(rc)); + } + + rc = update_stbuf_times(entry, ep, stbuf, NULL); + if (rc) { + daos_obj_close(obj->oh, NULL); + D_GOTO(err_obj, rc = daos_der2errno(rc)); + } + stbuf->st_size = sizeof(entry); + } } if (mode) @@ -3632,16 +3651,49 @@ lookup_rel_path(dfs_t *dfs, dfs_obj_t *root, const char *path, int flags, if (stbuf) { if (is_root) { + daos_epoch_t ep; + + /** refresh possibly stale root stbuf */ + rc = fetch_entry(dfs->layout_v, dfs->super_oh, DAOS_TX_NONE, "/", 1, false, + &exists, &entry, 0, NULL, NULL, NULL); + if (rc) { + D_ERROR("fetch_entry() failed: %d (%s)\n", rc, strerror(rc)); + D_GOTO(err_obj, rc); + } + + if (!exists || !S_ISDIR(entry.mode)) { + /** something really bad happened! 
*/
+				D_ERROR("Root object corrupted!");
+				D_GOTO(err_obj, rc = EIO);
+			}
+
+			if (mode)
+				*mode = entry.mode;
+			dfs->root_stbuf.st_mode = entry.mode;
+			dfs->root_stbuf.st_uid = entry.uid;
+			dfs->root_stbuf.st_gid = entry.gid;
+
+			rc = daos_obj_query_max_epoch(dfs->root.oh, DAOS_TX_NONE, &ep, NULL);
+			if (rc)
+				D_GOTO(err_obj, rc = daos_der2errno(rc));
+
+			/** object was updated since creation */
+			rc = update_stbuf_times(entry, ep, &dfs->root_stbuf, NULL);
+			if (rc)
+				D_GOTO(err_obj, rc);
+			if (tspec_gt(dfs->root_stbuf.st_ctim, dfs->root_stbuf.st_mtim)) {
+				dfs->root_stbuf.st_atim.tv_sec = entry.ctime;
+				dfs->root_stbuf.st_atim.tv_nsec = entry.ctime_nano;
+			} else {
+				dfs->root_stbuf.st_atim.tv_sec = entry.mtime;
+				dfs->root_stbuf.st_atim.tv_nsec = entry.mtime_nano;
+			}
 			memcpy(stbuf, &dfs->root_stbuf, sizeof(struct stat));
 		} else {
 			stbuf->st_nlink = 1;
 			stbuf->st_mode = obj->mode;
 			stbuf->st_uid = entry.uid;
-			stbuf->st_gid = entry.gid;
-			stbuf->st_mtim.tv_sec = entry.mtime;
-			stbuf->st_mtim.tv_nsec = entry.mtime_nano;
-			stbuf->st_ctim.tv_sec = entry.ctime;
-			stbuf->st_ctim.tv_nsec = entry.ctime_nano;
+			stbuf->st_gid = entry.gid;
 			if (tspec_gt(stbuf->st_ctim, stbuf->st_mtim)) {
 				stbuf->st_atim.tv_sec = entry.ctime;
 				stbuf->st_atim.tv_nsec = entry.ctime_nano;
@@ -3738,11 +3790,8 @@ readdir_int(dfs_t *dfs, dfs_obj_t *obj, daos_anchor_t *anchor, uint32_t *nr,
 			D_GOTO(out, rc = daos_der2errno(rc));
 
 		for (ptr = enum_buf, i = 0; i < number; i++) {
-			int len;
-
-			len = snprintf(dirs[key_nr].d_name,
-				       kds[i].kd_key_len + 1, "%s", ptr);
-			D_ASSERT(len >= kds[i].kd_key_len);
+			memcpy(dirs[key_nr].d_name, ptr, kds[i].kd_key_len);
+			dirs[key_nr].d_name[kds[i].kd_key_len] = '\0';
 			ptr += kds[i].kd_key_len;
 
 			/** stat the entry if requested */
@@ -5408,11 +5457,12 @@ dfs_osetattr(dfs_t *dfs, dfs_obj_t *obj, struct stat *stbuf, int flags)
 	bool		set_size = false;
 	bool		set_mtime = false;
 	bool		set_ctime = false;
-	int		i = 0;
-	size_t		len;
-	int		rc;
+	int		i = 0, hlc_recx_idx = 0;
+	size_t		len;
 	uint64_t	obj_hlc = 0;
 	struct stat	rstat = {};
+	daos_array_stbuf_t array_stbuf = {0};
+	int		rc;
 
 	if (dfs == NULL || !dfs->mounted)
 		return EINVAL;
@@ -5509,6 +5559,10 @@ dfs_osetattr(dfs_t *dfs, dfs_obj_t *obj, struct stat *stbuf, int flags)
 		d_iov_set(&sg_iovs[i], &obj_hlc, sizeof(uint64_t));
 		recxs[i].rx_idx = HLC_IDX;
 		recxs[i].rx_nr  = sizeof(uint64_t);
+		if (flags & DFS_SET_ATTR_SIZE) {
+			/** we need to update this again after the set size */
+			hlc_recx_idx = i;
+		}
 		i++;
 
 		set_mtime = true;
@@ -5558,38 +5612,41 @@ dfs_osetattr(dfs_t *dfs, dfs_obj_t *obj, struct stat *stbuf, int flags)
 		rstat.st_blocks = (stbuf->st_size + (1 << 9) - 1) >> 9;
 		rstat.st_size = stbuf->st_size;
 
-		/* mtime and ctime need to be updated too only if not set earlier */
-		if (!set_mtime || !set_ctime) {
-			daos_array_stbuf_t array_stbuf = {0};
+		/**
+		 * if mtime is set, we need to just update the hlc on the entry. if mtime and/or
+		 * ctime were not set, we need to update the stat buf returned. both cases require
+		 * an array stat for the hlc.
+ */ + /** TODO - need an array API to just stat the max epoch without size */ + rc = daos_array_stat(obj->oh, th, &array_stbuf, NULL); + if (rc) + D_GOTO(out_obj, rc = daos_der2errno(rc)); - /** TODO - need an array API to just stat the max epoch without size */ - rc = daos_array_stat(obj->oh, th, &array_stbuf, NULL); - if (rc) + if (!set_mtime) { + rc = d_hlc2timespec(array_stbuf.st_max_epoch, &rstat.st_mtim); + if (rc) { + D_ERROR("d_hlc2timespec() failed " DF_RC "\n", DP_RC(rc)); D_GOTO(out_obj, rc = daos_der2errno(rc)); - - if (!set_mtime) { - rc = d_hlc2timespec(array_stbuf.st_max_epoch, &rstat.st_mtim); - if (rc) { - D_ERROR("d_hlc2timespec() failed "DF_RC"\n", DP_RC(rc)); - D_GOTO(out_obj, rc = daos_der2errno(rc)); - } } + } else { + D_ASSERT(hlc_recx_idx > 0); + D_ASSERT(recxs[hlc_recx_idx].rx_idx == HLC_IDX); + d_iov_set(&sg_iovs[hlc_recx_idx], &array_stbuf.st_max_epoch, + sizeof(uint64_t)); + } - if (!set_ctime) { - rc = d_hlc2timespec(array_stbuf.st_max_epoch, &rstat.st_ctim); - if (rc) { - D_ERROR("d_hlc2timespec() failed "DF_RC"\n", DP_RC(rc)); - D_GOTO(out_obj, rc = daos_der2errno(rc)); - } + if (!set_ctime) { + rc = d_hlc2timespec(array_stbuf.st_max_epoch, &rstat.st_ctim); + if (rc) { + D_ERROR("d_hlc2timespec() failed " DF_RC "\n", DP_RC(rc)); + D_GOTO(out_obj, rc = daos_der2errno(rc)); } } } iod.iod_nr = i; - if (i == 0) D_GOTO(out_stat, rc = 0); - sgl.sg_nr = i; sgl.sg_nr_out = 0; sgl.sg_iovs = &sg_iovs[0]; @@ -6482,8 +6539,6 @@ dfs_listxattr(dfs_t *dfs, dfs_obj_t *obj, char *list, daos_size_t *size) continue; for (ptr = enum_buf, i = 0; i < number; i++) { - int len; - if (strncmp("x:", ptr, 2) != 0) { ptr += kds[i].kd_key_len; continue; @@ -6496,10 +6551,8 @@ dfs_listxattr(dfs_t *dfs, dfs_obj_t *obj, char *list, daos_size_t *size) if (list_size < kds[i].kd_key_len - 2) continue; - len = snprintf(ptr_list, kds[i].kd_key_len - 1, "%s", - ptr + 2); - D_ASSERT(len >= kds[i].kd_key_len - 2); - + memcpy(ptr_list, ptr + 2, kds[i].kd_key_len - 2); + ptr_list[kds[i].kd_key_len - 2] = '\0'; list_size -= kds[i].kd_key_len - 1; ptr_list += kds[i].kd_key_len - 1; ptr += kds[i].kd_key_len; @@ -6686,7 +6739,7 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args) } /** open the entry name and get the oid */ - rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL); + rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY | O_NOFOLLOW, &obj, NULL, NULL); if (rc) { D_ERROR("dfs_lookup_rel() of %s failed: %d\n", name, rc); return rc; diff --git a/src/client/dfs/duns.c b/src/client/dfs/duns.c index fadd2f1f7694..9f262857e6fe 100644 --- a/src/client/dfs/duns.c +++ b/src/client/dfs/duns.c @@ -1331,14 +1331,6 @@ duns_destroy_path(daos_handle_t poh, const char *path) return rc; } - /** Destroy the container */ - rc = daos_cont_destroy(poh, dattr.da_cont, 1, NULL); - if (rc) { - D_ERROR("Failed to destroy container (%d)\n", rc); - /** recreate the link ? */ - return daos_der2errno(rc); - } - if (dattr.da_type == DAOS_PROP_CO_LAYOUT_POSIX) { #ifdef LUSTRE_INCLUDE if (dattr.da_on_lustre) @@ -1369,6 +1361,14 @@ duns_destroy_path(daos_handle_t poh, const char *path) } } + /** Destroy the container */ + rc = daos_cont_destroy(poh, dattr.da_cont, 1, NULL); + if (rc) { + D_ERROR("Failed to destroy container (%d)\n", rc); + /** recreate the link ? 
*/ + return daos_der2errno(rc); + } + return 0; } diff --git a/src/client/dfuse/SConscript b/src/client/dfuse/SConscript index bafe87431f09..70ed85d151d3 100644 --- a/src/client/dfuse/SConscript +++ b/src/client/dfuse/SConscript @@ -6,6 +6,7 @@ COMMON_SRC = ['dfuse_obj_da.c', 'dfuse_vector.c'] DFUSE_SRC = ['dfuse_core.c', 'dfuse_main.c', 'dfuse_fuseops.c', + 'inval.c', 'dfuse_cont.c', 'dfuse_thread.c', 'dfuse_pool.c'] @@ -190,7 +191,7 @@ def scons(): dfuse_env = env.Clone(LIBS=[]) dfuse_env.AppendUnique(CPPPATH=[Dir('.').srcnode()]) dfuse_env.AppendUnique(CFLAGS=['-pthread']) - dfuse_env.AppendUnique(LIBS=['pthread', 'daos', 'daos_common', 'uuid']) + dfuse_env.AppendUnique(LIBS=['pthread', 'daos', 'daos_common', 'uuid', 'm']) gcc_env = il_env.Clone(LIBS=[]) gcc_env.AppendUnique(CPPPATH=[Dir('.').srcnode()]) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 2820c39bdd47..e55738d47cfd 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -28,7 +28,7 @@ struct dfuse_info { struct fuse_session *di_session; char *di_group; char *di_mountpoint; - uint32_t di_thread_count; + int32_t di_thread_count; uint32_t di_eq_count; bool di_threaded; bool di_foreground; @@ -782,13 +782,14 @@ struct fuse_lowlevel_ops dfuse_ops; DS_ERROR(-__rc, "fuse_reply_open() error"); \ } while (0) -#define DFUSE_REPLY_CREATE(_ie, req, entry, fi) \ +#define DFUSE_REPLY_CREATE(inode, req, entry, fi) \ do { \ int __rc; \ - DFUSE_TRA_DEBUG(_ie, "Returning create"); \ - _Static_assert(IS_IE(_ie), "Param is not inode entry"); \ - (_ie) = NULL; \ - __rc = fuse_reply_create(req, &entry, fi); \ + DFUSE_TRA_DEBUG(inode, "Returning create"); \ + ival_update_inode(inode, (entry).entry_timeout); \ + _Static_assert(IS_IE(inode), "Param is not inode entry"); \ + (inode) = NULL; \ + __rc = fuse_reply_create(req, &entry, fi); \ if (__rc != 0) \ DS_ERROR(-__rc, "fuse_reply_create() error"); \ } while (0) @@ -796,13 +797,13 @@ struct fuse_lowlevel_ops dfuse_ops; #define DFUSE_REPLY_ENTRY(inode, req, entry) \ do { \ int __rc; \ - DFUSE_TRA_DEBUG(inode, "Returning entry inode %#lx mode %#o size %#zx", \ - (entry).attr.st_ino, (entry).attr.st_mode, (entry).attr.st_size); \ if ((entry).attr_timeout > 0) { \ (inode)->ie_stat = (entry).attr; \ dfuse_mcache_set_time(inode); \ } \ - DFUSE_TRA_DEBUG(inode, "Returning entry inode %#lx mode %#o size %zi timeout %lf", \ + ival_update_inode(inode, (entry).entry_timeout); \ + DFUSE_TRA_DEBUG(inode, \ + "Returning entry inode %#lx mode %#o size %#zx timeout %lf", \ (entry).attr.st_ino, (entry).attr.st_mode, (entry).attr.st_size, \ (entry).attr_timeout); \ (inode) = NULL; \ @@ -811,6 +812,19 @@ struct fuse_lowlevel_ops dfuse_ops; DS_ERROR(-__rc, "fuse_reply_entry() error"); \ } while (0) +#define DFUSE_REPLY_NO_ENTRY(parent, req, timeout) \ + do { \ + int __rc; \ + struct fuse_entry_param _entry = {}; \ + _entry.entry_timeout = timeout; \ + DFUSE_TRA_DEBUG(parent, "Returning negative entry parent %#lx timeout %lf", \ + (parent)->ie_stat.st_ino, _entry.entry_timeout); \ + (parent) = NULL; \ + __rc = fuse_reply_entry(req, &_entry); \ + if (__rc != 0) \ + DS_ERROR(-__rc, "fuse_reply_entry() error"); \ + } while (0) + #define DFUSE_REPLY_STATFS(_ie, req, stat) \ do { \ int __rc; \ @@ -884,6 +898,9 @@ struct dfuse_inode_entry { /* Time of last kernel cache metadata update */ struct timespec ie_mcache_last_update; + /* Time of last kernel cache dentry update */ + struct timespec ie_dentry_last_update; + /* Time of last kernel cache data update, also used for kernel readdir 
caching. */ struct timespec ie_dcache_last_update; @@ -920,6 +937,9 @@ struct dfuse_inode_entry { * Checked on open of a file to determine if pre-caching is used. */ ATOMIC bool ie_linear_read; + + /* Entry on the evict list */ + d_list_t ie_evict_entry; }; /* Lookup an inode and take a ref on it. */ @@ -1018,6 +1038,30 @@ dfuse_mcache_evict(struct dfuse_inode_entry *ie); bool dfuse_mcache_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout); +/* Check the dentry cache setting against a given timeout, and return time left */ +bool +dfuse_dentry_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout); + +/* inval.c */ + +int +ival_add_cont_buckets(struct dfuse_cont *dfc); + +void +ival_drop_inode(struct dfuse_inode_entry *inode); + +int +ival_update_inode(struct dfuse_inode_entry *inode, double timeout); + +int +ival_init(struct dfuse_info *dfuse_info); + +int +ival_thread_start(struct dfuse_info *dfuse_info); + +void +ival_thread_stop(); + /* Data caching functions */ /* Mark the data cache as up-to-date from now */ diff --git a/src/client/dfuse/dfuse_cont.c b/src/client/dfuse/dfuse_cont.c index 888c6ef52f11..e76b9e95a18f 100644 --- a/src/client/dfuse/dfuse_cont.c +++ b/src/client/dfuse/dfuse_cont.c @@ -31,10 +31,8 @@ dfuse_cont_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * * lookups. */ if (uuid_parse(name, cont) < 0) { - struct fuse_entry_param entry = {.entry_timeout = 60}; - DFUSE_TRA_DEBUG(parent, "Invalid container uuid"); - DFUSE_REPLY_ENTRY(parent, req, entry); + DFUSE_REPLY_NO_ENTRY(parent, req, 60); return; } @@ -97,12 +95,8 @@ dfuse_cont_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * decref: d_hash_rec_decref(&dfp->dfp_cont_table, &dfc->dfs_entry); err: - if (rc == ENOENT) { - struct fuse_entry_param entry = {0}; - - entry.entry_timeout = parent->ie_dfs->dfc_ndentry_timeout; - DFUSE_REPLY_ENTRY(parent, req, entry); - } else { + if (rc == ENOENT) + DFUSE_REPLY_NO_ENTRY(parent, req, parent->ie_dfs->dfc_ndentry_timeout); + else DFUSE_REPLY_ERR_RAW(parent, req, rc); - } } diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index cb73d179e662..d5f254781fae 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -723,10 +723,15 @@ dfuse_cont_open_by_label(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, } else if (rc != 0) { D_GOTO(err_close, rc); } + } else { DFUSE_TRA_INFO(dfc, "Caching disabled"); } + rc = ival_add_cont_buckets(dfc); + if (rc) + goto err_close; + rc = dfuse_cont_open(dfuse_info, dfp, &c_info.ci_uuid, &dfc); if (rc) { D_FREE(dfc); @@ -805,9 +810,9 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, uuid_t *c /* Turn on some caching of metadata, otherwise container * operations will be very frequent */ - dfc->dfc_attr_timeout = 60; - dfc->dfc_dentry_dir_timeout = 60; - dfc->dfc_ndentry_timeout = 60; + dfc->dfc_attr_timeout = 60 * 5; + dfc->dfc_dentry_dir_timeout = 60 * 5; + dfc->dfc_ndentry_timeout = 60 * 5; } else if (*_dfc == NULL) { char str[37]; @@ -845,9 +850,15 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, uuid_t *c } else if (rc != 0) { D_GOTO(err_umount, rc); } + } else { DFUSE_TRA_INFO(dfc, "Caching disabled"); } + + rc = ival_add_cont_buckets(dfc); + if (rc != 0) + goto err_umount; + } else { /* This is either a container where a label is set on the * command line, or one created through mkdir, in either case @@ -948,6 +959,38 @@ dfuse_mcache_get_valid(struct 
dfuse_inode_entry *ie, double max_age, double *tim return use; } +bool +dfuse_dentry_get_valid(struct dfuse_inode_entry *ie, double max_age, double *timeout) +{ + bool use = false; + struct timespec now; + struct timespec left; + double time_left; + + D_ASSERT(max_age != -1); + D_ASSERT(max_age >= 0); + + if (ie->ie_dentry_last_update.tv_sec == 0) + return false; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + + left.tv_sec = now.tv_sec - ie->ie_dentry_last_update.tv_sec; + left.tv_nsec = now.tv_nsec - ie->ie_dentry_last_update.tv_nsec; + if (left.tv_nsec < 0) { + left.tv_sec--; + left.tv_nsec += 1000000000; + } + time_left = max_age - (left.tv_sec + ((double)left.tv_nsec / 1000000000)); + if (time_left > 0) + use = true; + + if (use && timeout) + *timeout = time_left; + + return use; +} + /* Set a timer to mark cache entry as valid */ void dfuse_dcache_set_time(struct dfuse_inode_entry *ie) @@ -1029,6 +1072,10 @@ dfuse_fs_init(struct dfuse_info *dfuse_info) if (rc != 0) D_GOTO(err_pt, rc); + rc = ival_init(dfuse_info); + if (rc != 0) + D_GOTO(err_it, rc = d_errno2der(rc)); + atomic_init(&dfuse_info->di_ino_next, 2); atomic_init(&dfuse_info->di_eqt_idx, 0); @@ -1081,6 +1128,9 @@ dfuse_fs_init(struct dfuse_info *dfuse_info) sem_destroy(&eqt->de_sem); DFUSE_TRA_DOWN(eqt); } + + ival_thread_stop(); +err_it: d_hash_table_destroy_inplace(&dfuse_info->dpi_iet, false); err_pt: d_hash_table_destroy_inplace(&dfuse_info->di_pool_table, false); @@ -1110,7 +1160,7 @@ dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) atomic_init(&ie->ie_open_write_count, 0); atomic_init(&ie->ie_il_count, 0); atomic_fetch_add_relaxed(&dfuse_info->di_inode_count, 1); - + D_INIT_LIST_HEAD(&ie->ie_evict_entry); D_MUTEX_INIT(&ie->ie_lock, NULL); } @@ -1120,6 +1170,8 @@ dfuse_ie_close(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) int rc; uint32_t ref; + ival_drop_inode(ie); + ref = atomic_load_relaxed(&ie->ie_ref); DFUSE_TRA_DEBUG(ie, "closing, inode %#lx ref %u, name " DF_DE ", parent %#lx", ie->ie_stat.st_ino, ref, DP_DE(ie->ie_name), ie->ie_parent); @@ -1458,6 +1510,9 @@ dfuse_fs_stop(struct dfuse_info *dfuse_info) sem_post(&eqt->de_sem); } + /* Stop and drain invalidation queues */ + ival_thread_stop(); + for (i = 0; i < dfuse_info->di_eq_count; i++) { struct dfuse_eq *eqt = &dfuse_info->di_eqt[i]; diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c index 1ef48600a6fb..0258c4dbf23d 100644 --- a/src/client/dfuse/dfuse_main.c +++ b/src/client/dfuse/dfuse_main.c @@ -194,6 +194,10 @@ dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args) return -DER_INVAL; } + rc = ival_thread_start(dfuse_info); + if (rc != 0) + D_GOTO(umount, rc = daos_errno2der(rc)); + rc = dfuse_send_to_fg(0); if (rc != -DER_SUCCESS) DFUSE_TRA_ERROR(dfuse_info, "Error sending signal to fg: "DF_RC, DP_RC(rc)); @@ -206,6 +210,8 @@ dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args) if (rc != 0) DHS_ERROR(dfuse_info, rc, "Fuse loop exited"); +umount: + fuse_session_unmount(dfuse_info->di_session); return daos_errno2der(rc); @@ -521,7 +527,7 @@ main(int argc, char **argv) } } - if (!dfuse_info->di_foreground && getenv("PMIX_RANK")) { + if (!dfuse_info->di_foreground && d_isenv_def("PMIX_RANK")) { DFUSE_TRA_WARNING(dfuse_info, "Not running in background under orterun"); dfuse_info->di_foreground = true; @@ -717,6 +723,8 @@ main(int argc, char **argv) out_pool: d_hash_rec_decref(&dfuse_info->di_pool_table, &dfp->dfp_entry); out_daos: + ival_thread_stop(); + 
rc2 = dfuse_fs_fini(dfuse_info); if (rc == -DER_SUCCESS) rc = rc2; diff --git a/src/client/dfuse/dfuse_pool.c b/src/client/dfuse/dfuse_pool.c index 745d2c9b7bf8..1ec2a79a0716 100644 --- a/src/client/dfuse/dfuse_pool.c +++ b/src/client/dfuse/dfuse_pool.c @@ -38,10 +38,8 @@ dfuse_pool_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * * lookups. */ if (uuid_parse(name, pool) < 0) { - struct fuse_entry_param entry = {.entry_timeout = 60}; - DFUSE_TRA_DEBUG(parent, "Invalid pool uuid"); - DFUSE_REPLY_ENTRY(parent, req, entry); + DFUSE_REPLY_NO_ENTRY(parent, req, 60); return; } @@ -134,12 +132,8 @@ dfuse_pool_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char * dfuse_ie_free(dfuse_info, ie); daos_prop_free(prop); err: - if (rc == ENOENT) { - struct fuse_entry_param entry = {0}; - - entry.entry_timeout = parent->ie_dfs->dfc_ndentry_timeout; - DFUSE_REPLY_ENTRY(parent, req, entry); - } else { + if (rc == ENOENT) + DFUSE_REPLY_NO_ENTRY(parent, req, parent->ie_dfs->dfc_ndentry_timeout); + else DFUSE_REPLY_ERR_RAW(parent, req, rc); - } } diff --git a/src/client/dfuse/il/int_read.c b/src/client/dfuse/il/int_read.c index 24f6be3051d9..346cd14c288d 100644 --- a/src/client/dfuse/il/int_read.c +++ b/src/client/dfuse/il/int_read.c @@ -13,21 +13,15 @@ #include "ioil.h" static ssize_t -read_bulk(char *buff, size_t len, off_t position, struct fd_entry *entry, int *errcode) +read_bulksgl(d_sg_list_t *sgl, size_t len, off_t position, struct fd_entry *entry, int *errcode) { - daos_size_t read_size = 0; - d_iov_t iov = {}; - d_sg_list_t sgl = {}; + daos_size_t read_size = 0; daos_event_t ev; daos_handle_t eqh; int rc; DFUSE_TRA_DEBUG(entry->fd_dfsoh, "%#zx-%#zx", position, position + len - 1); - sgl.sg_nr = 1; - d_iov_set(&iov, (void *)buff, len); - sgl.sg_iovs = &iov; - rc = ioil_get_eqh(&eqh); if (rc == 0) { bool flag = false; @@ -39,8 +33,8 @@ read_bulk(char *buff, size_t len, off_t position, struct fd_entry *entry, int *e D_GOTO(out, rc = daos_der2errno(rc)); } - rc = dfs_read(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, &sgl, position, - &read_size, &ev); + rc = dfs_read(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, sgl, position, &read_size, + &ev); if (rc) D_GOTO(out, rc); @@ -57,7 +51,7 @@ read_bulk(char *buff, size_t len, off_t position, struct fd_entry *entry, int *e } rc = ev.ev_error; } else { - rc = dfs_read(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, &sgl, position, &read_size, + rc = dfs_read(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, sgl, position, &read_size, NULL); } out: @@ -72,29 +66,47 @@ read_bulk(char *buff, size_t len, off_t position, struct fd_entry *entry, int *e ssize_t ioil_do_pread(char *buff, size_t len, off_t position, struct fd_entry *entry, int *errcode) { - return read_bulk(buff, len, position, entry, errcode); + d_iov_t iov = {}; + d_sg_list_t sgl = {}; + + sgl.sg_nr = 1; + d_iov_set(&iov, (void *)buff, len); + sgl.sg_iovs = &iov; + + return read_bulksgl(&sgl, len, position, entry, errcode); } ssize_t ioil_do_preadv(const struct iovec *iov, int count, off_t position, struct fd_entry *entry, int *errcode) { - ssize_t bytes_read; - ssize_t total_read = 0; - int i; + d_iov_t *diov; + d_sg_list_t sgl = {}; + ssize_t total_read = 0; + int i; + int rc; + int new_count; + + D_ALLOC_ARRAY(diov, count); + if (diov == NULL) { + *errcode = ENOMEM; + return -1; + } - for (i = 0; i < count; i++) { - bytes_read = read_bulk(iov[i].iov_base, iov[i].iov_len, position, entry, errcode); + for (i = 0, new_count = 0; i < count; i++) { + /** See DAOS-15089. 
This is a workaround */ + if (iov[i].iov_len == 0) + continue; + d_iov_set(&diov[new_count++], iov[i].iov_base, iov[i].iov_len); + total_read += iov[i].iov_len; + } - if (bytes_read == -1) - return (ssize_t)-1; + sgl.sg_nr = new_count; + sgl.sg_iovs = diov; - if (bytes_read == 0) - return total_read; + rc = read_bulksgl(&sgl, total_read, position, entry, errcode); - position += bytes_read; - total_read += bytes_read; - } + D_FREE(diov); - return total_read; + return rc; } diff --git a/src/client/dfuse/il/int_write.c b/src/client/dfuse/il/int_write.c index 2de4b3a44603..c95e23e0909b 100644 --- a/src/client/dfuse/il/int_write.c +++ b/src/client/dfuse/il/int_write.c @@ -14,21 +14,15 @@ #include "ioil.h" -ssize_t -ioil_do_writex(const char *buff, size_t len, off_t position, struct fd_entry *entry, int *errcode) +static ssize_t +ioil_do_writesgl(d_sg_list_t *sgl, size_t len, off_t position, struct fd_entry *entry, int *errcode) { - d_iov_t iov = {}; - d_sg_list_t sgl = {}; daos_event_t ev; daos_handle_t eqh; int rc; DFUSE_TRA_DEBUG(entry->fd_dfsoh, "%#zx-%#zx", position, position + len - 1); - sgl.sg_nr = 1; - d_iov_set(&iov, (void *)buff, len); - sgl.sg_iovs = &iov; - rc = ioil_get_eqh(&eqh); if (rc == 0) { bool flag = false; @@ -40,7 +34,7 @@ ioil_do_writex(const char *buff, size_t len, off_t position, struct fd_entry *en D_GOTO(out, rc = daos_der2errno(rc)); } - rc = dfs_write(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, &sgl, position, &ev); + rc = dfs_write(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, sgl, position, &ev); if (rc) D_GOTO(out, rc); @@ -57,7 +51,7 @@ ioil_do_writex(const char *buff, size_t len, off_t position, struct fd_entry *en } rc = ev.ev_error; } else { - rc = dfs_write(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, &sgl, position, NULL); + rc = dfs_write(entry->fd_cont->ioc_dfs, entry->fd_dfsoh, sgl, position, NULL); } out: if (rc) { @@ -68,27 +62,50 @@ ioil_do_writex(const char *buff, size_t len, off_t position, struct fd_entry *en return len; } +ssize_t +ioil_do_writex(const char *buff, size_t len, off_t position, struct fd_entry *entry, int *errcode) +{ + d_iov_t iov = {}; + d_sg_list_t sgl = {}; + + sgl.sg_nr = 1; + d_iov_set(&iov, (void *)buff, len); + sgl.sg_iovs = &iov; + + return ioil_do_writesgl(&sgl, len, position, entry, errcode); +} + ssize_t ioil_do_pwritev(const struct iovec *iov, int count, off_t position, struct fd_entry *entry, int *errcode) { - ssize_t bytes_written; - ssize_t total_write = 0; - int i; + d_iov_t *diov; + d_sg_list_t sgl = {}; + size_t total_write = 0; + int i; + int rc; + int new_count; + + D_ALLOC_ARRAY(diov, count); + if (diov == NULL) { + *errcode = ENOMEM; + return -1; + } - for (i = 0; i < count; i++) { - bytes_written = - ioil_do_writex(iov[i].iov_base, iov[i].iov_len, position, entry, errcode); + for (i = 0, new_count = 0; i < count; i++) { + /** See DAOS-15089. 
This is a workaround */
+		if (iov[i].iov_len == 0)
+			continue;
+		d_iov_set(&diov[new_count++], iov[i].iov_base, iov[i].iov_len);
+		total_write += iov[i].iov_len;
+	}
 
-		if (bytes_written == -1)
-			return (ssize_t)-1;
+	sgl.sg_nr   = new_count;
+	sgl.sg_iovs = diov;
 
-		if (bytes_written == 0)
-			return total_write;
+	rc = ioil_do_writesgl(&sgl, total_write, position, entry, errcode);
 
-		position += bytes_written;
-		total_write += bytes_written;
-	}
+	D_FREE(diov);
 
-	return total_write;
+	return rc;
 }
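Both vectored paths above, `ioil_do_preadv()` and `ioil_do_pwritev()`, now share the same pattern: build one scatter-gather list covering all segments and issue a single `dfs_read`/`dfs_write`, instead of one bulk call per iovec. A self-contained sketch of the conversion step, assuming only the `d_iov_t`/`d_sg_list_t` types from `gurt/types.h` (the `iov_to_sgl` helper name is illustrative, not part of the patch):

```c
#include <sys/uio.h>
#include <gurt/types.h>

/* Build a d_sg_list_t over an iovec array, skipping zero-length segments
 * (the DAOS-15089 workaround above); returns the total transfer size. */
static ssize_t
iov_to_sgl(const struct iovec *iov, int count, d_iov_t *diov, d_sg_list_t *sgl)
{
	ssize_t total = 0;
	int     n     = 0;

	for (int i = 0; i < count; i++) {
		if (iov[i].iov_len == 0)
			continue;
		d_iov_set(&diov[n++], iov[i].iov_base, iov[i].iov_len);
		total += iov[i].iov_len;
	}
	sgl->sg_nr   = n;
	sgl->sg_iovs = diov;
	return total;
}
```

The zero-length filtering mirrors the DAOS-15089 workaround applied in both functions above.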
diff --git a/src/client/dfuse/inval.c b/src/client/dfuse/inval.c
new file mode 100644
index 000000000000..ffe85e2fa9f7
--- /dev/null
+++ b/src/client/dfuse/inval.c
@@ -0,0 +1,438 @@
+/**
+ * (C) Copyright 2016-2023 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include <math.h>
+
+#include <semaphore.h>
+
+#include "dfuse_common.h"
+#include "dfuse.h"
+
+/* Evict inodes based on timeout.
+ *
+ * The goal here is to have dfuse release resources over time, rather than the default, which would
+ * be that the kernel keeps a reference on everything until there's memory pressure (effectively
+ * forever), and then instruct the kernel to forget things which have expired.
+ *
+ * This reduces both kernel memory and dfuse memory, keeps the hash table much smaller and allows
+ * dfuse to close containers and disconnect from pools, meaning that at idle dfuse resource
+ * consumption tends to zero.
+ *
+ * For kernel based filesystems there is a benefit to keeping all this data in memory as it can
+ * simply be re-validated before use; with fuse + DAOS, however, a re-validate costs the same as a
+ * lookup, so there is no benefit in keeping this data around.
+ *
+ * Maintain a number of lists for inode timeouts: for each timeout value keep a list of inodes
+ * that are using that value, and when an inode is refreshed by the kernel move the inode to the
+ * end of the correct list.
+ *
+ * Separately have a thread which periodically will walk each list starting at the front and
+ * invalidate any entries where the timeout has expired.
+ *
+ * In this way the lists are never fully traversed: on access an entry is removed from where it is
+ * and appended to the end, and the eviction scan starts at the front of the list and traverses
+ * only as far as it needs to, until it reaches a front entry that is to be kept.
+ *
+ * As lookups will not be repeated by the kernel until after the timeout has expired, allow some
+ * leeway before eviction to allow re-validation of in-use datasets without triggering entire tree
+ * invalidations through the kernel. Directories get five seconds, anything else two. Ideally
+ * directories would be invalidated first as this would result in fewer dfuse->kernel calls, as
+ * once the kernel invalidates a directory it invalidates the whole tree below it; however, there
+ * are also use-cases with significant numbers of files per directory where the directory is in
+ * active use but individual files are not.
+ *
+ * Future work might be to speculatively perform lookups close to the end of the timeout period;
+ * then, if an entry was in frequent use, its lookup could be performed from memory, effectively
+ * moving the re-validation cost off the critical path. This code currently only handles dentries
+ * but could also separately track attributes (inodes) and file contents as well.
+ *
+ * Additional changes to consider in the future could include:
+ *  Better handling of eviction timeouts, "max(time * 1.1, 10)" would be better than a flat +x/+5.
+ *  Use arrays rather than lists for the buckets for faster iteration.
+ *  Reference counting the timeout buckets.
+ *
+ * Locking: The ival_lock is contended; it is accessed in several places, however none do any more
+ * than list management. As inodes might be removed from one list and re-inserted into another
+ * there is a per-subsystem lock rather than per-list locks. It is taken in:
+ *  ie_close(), which is called from forget and some failure paths in readdir().
+ *  lookup(), to move entries to the end of this list.
+ *  de_run(), to pull items from the front of the list.
+ *
+ * Wakeup: The invalidation thread is woken up when:
+ *  dfuse is exiting.
+ *  something is added to an empty list.
+ *  after a timeout.
+ * Timeouts are chosen based on the entries still on any list; dfuse will sleep as long as it can
+ * but at least 2 seconds and at most 60.
+ * As this relates to releasing resources there is no additional benefit in finer-grained time
+ * control than this.
+ */
+
+/* Grace period before invalidating directories or non-directories. Needs to be long enough so
+ * that entries in the working set are not invalidated but short enough to be meaningful.
+ * Directories that are used as the cwd for processes can cause problems with being invalidated
+ * too early so use a higher value here.
+ */
+#define INVAL_DIRECTORY_GRACE (60 * 30)
+#define INVAL_FILE_GRACE      2
+
+/* Represents one timeout value (time). Maintains an ordered list of dentries that are using
+ * this timeout.
+ */
+struct dfuse_time_entry {
+	d_list_t inode_list;
+	double   time;
+	d_list_t dte_list;
+};
+
+/* Core data structure, maintains a list of struct dfuse_time_entry lists. */
+struct dfuse_ival {
+	d_list_t             time_entry_list;
+	struct fuse_session *session;
+	bool                 session_dead;
+};
+
+/* The core data from struct dfuse_inode_entry. No additional inode references are held on inodes
+ * because of their place on invalidate lists; rather, inodes are removed from any list on close.
+ * Therefore once a decision is made to evict an inode a copy of the data is needed, as once the
+ * ival_lock is dropped the inode could be freed. This is not a problem if this happens as the
+ * kernel will simply return ENOENT.
+ */
+struct inode_core {
+	char       name[NAME_MAX + 1];
+	fuse_ino_t parent;
+	bool       dir;
+};
+
+/* Number of dentries to invalidate per iteration. This value affects how long the lock is held;
+ * after the invalidations happen another iteration will start immediately. Invalidation of
+ * directories however triggers many forget calls, so we want to make use of this where possible
+ * and keep this batch size small.
+ */
+#define EVICT_COUNT 8
+
+static pthread_mutex_t   ival_lock = PTHREAD_MUTEX_INITIALIZER;
+static bool              ival_stop;
+static pthread_t         ival_thread;
+static sem_t             ival_sem;
+static struct dfuse_ival ival_data;
+
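The structures above form the list-of-lists described in the file comment: one `dfuse_time_entry` bucket per distinct timeout, each holding its inodes oldest-first. A condensed, self-contained model of the refresh operation (illustrative only; `bucket` and `entry` stand in for `dfuse_time_entry` and `dfuse_inode_entry`):

```c
#include <time.h>
#include <gurt/list.h>

struct bucket {
	double   timeout; /* the dentry timeout this bucket covers */
	d_list_t entries; /* oldest at the head, newest at the tail */
};

struct entry {
	struct timespec last_update;
	d_list_t        link;
};

/* O(1) on every lookup reply: stamp the entry and move it to the tail,
 * so each bucket stays sorted by age without ever being traversed. */
static void
bucket_refresh(struct bucket *b, struct entry *e)
{
	clock_gettime(CLOCK_MONOTONIC_COARSE, &e->last_update);
	d_list_move_tail(&e->link, &b->entries);
}
```

The evictor then only ever needs to test list heads: the first entry that is still valid ends the scan for that bucket, which is exactly what the loop below does.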
+/* Eviction loop, run periodically in its own thread.
+ *
+ * Returns true if there is more work to do. If false then *sleep_time is set in seconds.
+ */
+static bool
+ival_loop(int *sleep_time)
+{
+	struct dfuse_time_entry *dte;
+	struct inode_core        ic[EVICT_COUNT] = {};
+	int                      idx             = 0;
+	double                   sleep           = (60 * 1) - 1;
+
+	D_MUTEX_LOCK(&ival_lock);
+
+	/* Walk the list, oldest first */
+	d_list_for_each_entry(dte, &ival_data.time_entry_list, dte_list) {
+		struct dfuse_inode_entry *inode, *inodep;
+
+		DFUSE_TRA_DEBUG(dte, "Iterating for timeout %lf", dte->time);
+
+		d_list_for_each_entry_safe(inode, inodep, &dte->inode_list, ie_evict_entry) {
+			double timeout;
+
+			if (dfuse_dentry_get_valid(inode, dte->time, &timeout)) {
+				DFUSE_TRA_DEBUG(inode, "Keeping left %lf " DF_DE, timeout,
+						DP_DE(inode->ie_name));
+				if (timeout < sleep)
+					sleep = timeout;
+				break;
+			}
+
+			if (atomic_load_relaxed(&inode->ie_open_count) != 0) {
+				DFUSE_TRA_DEBUG(inode, "File is open " DF_DE,
+						DP_DE(inode->ie_name));
+				continue;
+			}
+
+			/* Log the mode here, but possibly just evict dirs anyway */
+			ic[idx].parent = inode->ie_parent;
+			strncpy(ic[idx].name, inode->ie_name, NAME_MAX + 1);
+			ic[idx].name[NAME_MAX] = '\0';
+			ic[idx].dir            = S_ISDIR(inode->ie_stat.st_mode);
+
+			d_list_del_init(&inode->ie_evict_entry);
+
+			idx++;
+
+			if (idx == EVICT_COUNT)
+				goto out;
+		}
+	}
+out:
+	*sleep_time = (int)round(sleep + 0.5);
+
+	DFUSE_TRA_DEBUG(&ival_data, "Unlocking, allowing to sleep for %d seconds", *sleep_time);
+	D_MUTEX_UNLOCK(&ival_lock);
+
+	if (idx == 0 || ival_data.session_dead)
+		return false;
+
+	for (int i = 0; i < idx; i++) {
+		int rc;
+
+		DFUSE_TRA_DEBUG(&ival_data, "Evicting entry %#lx " DF_DE " dir:" DF_BOOL,
+				ic[i].parent, DP_DE(ic[i].name), DP_BOOL(ic[i].dir));
+
+		rc = fuse_lowlevel_notify_inval_entry(ival_data.session, ic[i].parent, ic[i].name,
+						      strnlen(ic[i].name, NAME_MAX));
+		if (rc && rc != -ENOENT && rc != -EBADF)
+			DHS_ERROR(&ival_data, -rc, "notify_inval_entry() failed");
+		if (rc == -EBADF)
+			ival_data.session_dead = true;
+	}
+
+	return true;
+}
+
+/* Main loop for the eviction thread. Spins until ready for exit, waking after one second and
+ * iterating over all newly expired dentries.
+ */
+static void *
+ival_thread_fn(void *arg)
+{
+	int sleep_time = 1;
+
+	while (1) {
+		struct timespec ts = {};
+		int             rc;
+
+		if (clock_gettime(CLOCK_REALTIME, &ts) == -1)
+			D_ERROR("Unable to get time");
+		ts.tv_sec += sleep_time;
+
+		rc = sem_timedwait(&ival_sem, &ts);
+		if (rc == 0) {
+			if (ival_stop)
+				return NULL;
+		} else {
+			rc = errno;
+
+			if (errno != ETIMEDOUT)
+				DS_ERROR(rc, "sem_wait");
+		}
+
+		while (ival_loop(&sleep_time))
+			;
+		if (sleep_time < 2)
+			sleep_time = 2;
+		DFUSE_TRA_DEBUG(&ival_data, "Sleeping %d", sleep_time);
+	}
+	return NULL;
+}
+
+/* Allocate and insert a new time value entry */
+static int
+ival_bucket_add(d_list_t *list, double timeout)
+{
+	struct dfuse_time_entry *dte;
+
+	D_ALLOC_PTR(dte);
+	if (dte == NULL)
+		return ENOMEM;
+
+	DFUSE_TRA_UP(dte, &ival_data, "time bucket");
+
+	dte->time = timeout;
+	D_INIT_LIST_HEAD(&dte->inode_list);
+
+	d_list_add_tail(&dte->dte_list, list);
+	return 0;
+}
+
+/* Sets up the initial data structures; after this ival_add_cont_buckets() may be called before
+ * ival_thread_start().
+ */
+int
+ival_init(struct dfuse_info *dfuse_info)
+{
+	int rc;
+
+	DFUSE_TRA_UP(&ival_data, dfuse_info, "invalidator");
+
+	D_INIT_LIST_HEAD(&ival_data.time_entry_list);
+
+	rc = sem_init(&ival_sem, 0, 0);
+	if (rc != 0)
+		D_GOTO(out, rc = errno);
+
+	rc = ival_bucket_add(&ival_data.time_entry_list, 0);
+	if (rc)
+		goto sem;
+
+out:
+	return rc;
+sem:
+	sem_destroy(&ival_sem);
+	DFUSE_TRA_DOWN(&ival_data);
+	return rc;
+}
+
+/* Start the thread. Not called until after fuse is mounted */
+int
+ival_thread_start(struct dfuse_info *dfuse_info)
+{
+	int rc;
+
+	ival_data.session = dfuse_info->di_session;
+
+	rc = pthread_create(&ival_thread, NULL, ival_thread_fn, NULL);
+	if (rc != 0)
+		goto out;
+	pthread_setname_np(ival_thread, "invalidator");
+
+out:
+	return rc;
+}
+
+/* Stop the thread, remove all inodes from the invalidation queues and tear down all data
+ * structures. May be called without thread_start() having been called.
+ */
+void
+ival_thread_stop()
+{
+	struct dfuse_time_entry *dte, *dtep;
+
+	ival_stop = true;
+	/* Stop and drain evict queues */
+	sem_post(&ival_sem);
+
+	if (ival_thread)
+		pthread_join(ival_thread, NULL);
+	ival_thread = 0;
+
+	/* Walk the list, oldest first */
+	d_list_for_each_entry_safe(dte, dtep, &ival_data.time_entry_list, dte_list) {
+		struct dfuse_inode_entry *inode, *inodep;
+
+		d_list_for_each_entry_safe(inode, inodep, &dte->inode_list, ie_evict_entry)
+			d_list_del_init(&inode->ie_evict_entry);
+
+		d_list_del(&dte->dte_list);
+		D_FREE(dte);
+	}
+	DFUSE_TRA_DOWN(&ival_data);
+}
+
+/* Update the invalidation time for an inode */
+int
+ival_update_inode(struct dfuse_inode_entry *inode, double timeout)
+{
+	struct dfuse_time_entry *dte;
+	struct timespec          now;
+	bool                     wake = false;
+
+	if (S_ISDIR(inode->ie_stat.st_mode))
+		timeout += INVAL_DIRECTORY_GRACE;
+	else
+		timeout += INVAL_FILE_GRACE;
+
+	clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+
+	D_MUTEX_LOCK(&ival_lock);
+	inode->ie_dentry_last_update = now;
+
+	/* Walk each timeout value.
+	 * These go longest to shortest so walk the list until one is found where the value is
+	 * lower than we're looking for.
+	 */
+	d_list_for_each_entry(dte, &ival_data.time_entry_list, dte_list) {
+		if (dte->time > timeout)
+			continue;
+
+		if (d_list_empty(&dte->inode_list))
+			wake = true;
+
+		DFUSE_TRA_DEBUG(inode, "timeout %lf wake:" DF_BOOL " %#lx " DF_DE, timeout,
+				DP_BOOL(wake), inode->ie_parent, DP_DE(inode->ie_name));
+
+		d_list_move_tail(&inode->ie_evict_entry, &dte->inode_list);
+		break;
+	}
+
+	D_MUTEX_UNLOCK(&ival_lock);
+
+	if (wake)
+		sem_post(&ival_sem);
+
+	return 0;
+}
+
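To make the grace arithmetic concrete, consider a hypothetical container mounted with a 1-second file dentry timeout and the 300-second directory timeout that `dfuse_cont_open()` sets above (60 * 5); the values below are illustrative only:

```c
/* Hypothetical mount settings (not from any real config): */
double dentry_timeout     = 1;   /* regular files */
double dentry_dir_timeout = 300; /* directories, 60 * 5 as set in dfuse_cont_open() */

/* Buckets created by ival_add_cont_buckets() below: */
double file_bucket = dentry_timeout + INVAL_FILE_GRACE;          /* 1 + 2      = 3s    */
double dir_bucket  = dentry_dir_timeout + INVAL_DIRECTORY_GRACE; /* 300 + 1800 = 2100s */
```

A lookup reply for a regular file therefore lands on the tail of the 3-second bucket, and the dentry is only evicted once it has gone two full seconds past its kernel-visible timeout without being re-validated.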
+/* Ensure there's a timeout list for the given value.
+ * Check if one exists already, and if it does not then insert it into the right location.
+ *
+ * Returns a system error code.
+ */
+static int
+ival_bucket_add_value(double timeout)
+{
+	struct dfuse_time_entry *dte;
+	double                   lower = -1;
+	int                      rc    = 0;
+
+	DFUSE_TRA_INFO(&ival_data, "Setting up timeout queue for %lf", timeout);
+
+	D_MUTEX_LOCK(&ival_lock);
+
+	/* Walk smallest to largest */
+	d_list_for_each_entry_reverse(dte, &ival_data.time_entry_list, dte_list) {
+		if (dte->time == timeout)
+			D_GOTO(out, rc = -DER_SUCCESS);
+		if (dte->time < timeout)
+			lower = dte->time;
+		if (dte->time > timeout)
+			break;
+	}
+
+	if (lower == -1) {
+		rc = ival_bucket_add(&ival_data.time_entry_list, timeout);
+		goto out;
+	}
+
+	d_list_for_each_entry_reverse(dte, &ival_data.time_entry_list, dte_list) {
+		if (dte->time < lower)
+			continue;
+
+		rc = ival_bucket_add(&dte->dte_list, timeout);
+		break;
+	}
+
+out:
+	D_MUTEX_UNLOCK(&ival_lock);
+
+	return rc;
+}
+
+/* Ensure the correct buckets exist for an attached container */
+int
+ival_add_cont_buckets(struct dfuse_cont *dfc)
+{
+	int rc, rc2;
+
+	rc  = ival_bucket_add_value(dfc->dfc_dentry_timeout + INVAL_FILE_GRACE);
+	rc2 = ival_bucket_add_value(dfc->dfc_dentry_dir_timeout + INVAL_DIRECTORY_GRACE);
+
+	return rc ? rc : rc2;
+}
+
+/* Called from ie_close() to remove inode from any possible list */
+void
+ival_drop_inode(struct dfuse_inode_entry *ie)
+{
+	D_MUTEX_LOCK(&ival_lock);
+	if (!d_list_empty(&ie->ie_evict_entry))
+		d_list_del(&ie->ie_evict_entry);
+	D_MUTEX_UNLOCK(&ival_lock);
+}
diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c
index f47e88986cdf..9e1df7a61189 100644
--- a/src/client/dfuse/ops/lookup.c
+++ b/src/client/dfuse/ops/lookup.c
@@ -292,12 +292,8 @@ dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent,
 out_free:
 	dfuse_ie_free(dfuse_info, ie);
 out:
-	if (rc == ENOENT && parent->ie_dfs->dfc_ndentry_timeout > 0) {
-		struct fuse_entry_param entry = {};
-
-		entry.entry_timeout = parent->ie_dfs->dfc_ndentry_timeout;
-		DFUSE_REPLY_ENTRY(parent, req, entry);
-	} else {
+	if (rc == ENOENT && parent->ie_dfs->dfc_ndentry_timeout > 0)
+		DFUSE_REPLY_NO_ENTRY(parent, req, parent->ie_dfs->dfc_ndentry_timeout);
+	else
 		DFUSE_REPLY_ERR_RAW(parent, req, rc);
-	}
 }
diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c
index f5b07f9c9702..fd6a71169dec 100644
--- a/src/client/dfuse/ops/open.c
+++ b/src/client/dfuse/ops/open.c
@@ -225,7 +225,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
 			rc = fuse_lowlevel_notify_inval_entry(dfuse_info->di_session, ie->ie_parent,
 							      ie->ie_name,
 							      strnlen(ie->ie_name, NAME_MAX));
-			if (rc != 0)
+			if (rc != 0 && rc != -ENOENT)
 				DHS_ERROR(ie, -rc, "inval_entry() error");
 			dfuse_inode_decref(dfuse_info, ie);
 		}
diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c
index ed24796878f9..091d4102c1ca 100644
--- a/src/client/dfuse/ops/opendir.c
+++ b/src/client/dfuse/ops/opendir.c
@@ -82,7 +82,7 @@ dfuse_cb_releasedir(fuse_req_t req, struct dfuse_inode_entry *ino, struct fuse_f
 			rc = fuse_lowlevel_notify_inval_entry(dfuse_info->di_session, ie->ie_parent,
 							      ie->ie_name,
 							      strnlen(ie->ie_name, NAME_MAX));
-			if (rc != 0)
+			if (rc != 0 && rc != -ENOENT)
 				DHS_ERROR(ie, -rc, "inval_entry() error");
 			dfuse_inode_decref(dfuse_info, ie);
 		}
diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index 80ff59264420..b50caad76efc 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2022-2023 Intel Corporation.
+ * (C) Copyright 2022-2024 Intel Corporation.
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -61,7 +61,7 @@ #define MAX_DAOS_MT (8) #define READ_DIR_BATCH_SIZE (96) -#define MAX_FD_DUP2ED (8) +#define MAX_FD_DUP2ED (16) #define MAX_MMAP_BLOCK (64) @@ -78,6 +78,14 @@ #define MAX_EQ 64 +/* the default min fd that will be used by DAOS */ +#define DAOS_MIN_FD 10 + +/* the number of low fd reserved */ +static uint16_t low_fd_count; +/* the list of low fd reserved */ +static int low_fd_list[DAOS_MIN_FD]; + /* In case of fork(), only the parent process could destroy daos env. */ static bool context_reset; static __thread daos_handle_t td_eqh; @@ -126,7 +134,7 @@ static _Atomic uint32_t daos_init_cnt; static bool report; static long int page_size; -static bool daos_inited; +static _Atomic bool daos_inited; static bool daos_debug_inited; static int num_dfs; static struct dfs_mt dfs_list[MAX_DAOS_MT]; @@ -185,7 +193,6 @@ struct mmap_obj { struct fd_dup2 { int fd_src, fd_dest; - bool dest_closed; }; /* Add the data structure for statx_timestamp and statx @@ -233,7 +240,7 @@ struct statx { #endif /* working dir of current process */ -static char cur_dir[DFS_MAX_PATH] = ""; +static char cur_dir[DFS_MAX_PATH + 1] = ""; static bool segv_handler_inited; /* Old segv handler */ struct sigaction old_segv; @@ -241,6 +248,7 @@ struct sigaction old_segv; /* the flag to indicate whether initlization is finished or not */ static bool hook_enabled; static bool hook_enabled_bak; +static pthread_mutex_t lock_reserve_fd; static pthread_mutex_t lock_dfs; static pthread_mutex_t lock_fd; static pthread_mutex_t lock_dirfd; @@ -403,6 +411,8 @@ static ssize_t (*next_pwrite)(int fd, const void *buf, size_t size, off_t offset static off_t (*libc_lseek)(int fd, off_t offset, int whence); static off_t (*pthread_lseek)(int fd, off_t offset, int whence); +static int new_fxstat(int vers, int fd, struct stat *buf); + static int (*next_fxstat)(int vers, int fd, struct stat *buf); static int (*next_fstat)(int fd, struct stat *buf); @@ -532,6 +542,8 @@ remove_dot_dot(char path[], int *len); static int remove_dot_and_cleanup(char szPath[], int len); +/* reference count of fake fd duplicated by real fd with dup2() */ +static int dup_ref_count[MAX_OPENED_FILE]; static struct file_obj *file_list[MAX_OPENED_FILE]; static struct dir_obj *dir_list[MAX_OPENED_DIR]; static struct mmap_obj mmap_list[MAX_MMAP_BLOCK]; @@ -548,7 +560,7 @@ find_next_available_dirfd(struct dir_obj *obj, int *new_fd); static int find_next_available_map(int *idx); static void -free_fd(int idx); +free_fd(int idx, bool closing_dup_fd); static void free_dirfd(int idx); static void @@ -600,13 +612,14 @@ query_dfs_mount(const char *path) static int discover_daos_mount_with_env(void) { - int idx, len_fs_root, rc; - char *fs_root = NULL; - char *pool = NULL; - char *container = NULL; + int idx, rc; + char *fs_root = NULL; + char *pool = NULL; + char *container = NULL; + size_t len_fs_root, len_pool, len_container; /* Add the mount if env DAOS_MOUNT_POINT is set. 
 */
-	fs_root = getenv("DAOS_MOUNT_POINT");
+	rc = d_agetenv_str(&fs_root, "DAOS_MOUNT_POINT");
 	if (fs_root == NULL)
 		/* env DAOS_MOUNT_POINT is undefined, return success (0) */
 		D_GOTO(out, rc = 0);
@@ -633,31 +646,56 @@ discover_daos_mount_with_env(void)
 		D_GOTO(out, rc = ENAMETOOLONG);
 	}
 
-	pool = getenv("DAOS_POOL");
+	d_agetenv_str(&pool, "DAOS_POOL");
 	if (pool == NULL) {
 		D_FATAL("DAOS_POOL is not set.\n");
 		D_GOTO(out, rc = EINVAL);
 	}
 
-	container = getenv("DAOS_CONTAINER");
+	len_pool = strnlen(pool, DAOS_PROP_MAX_LABEL_BUF_LEN);
+	if (len_pool >= DAOS_PROP_MAX_LABEL_BUF_LEN) {
+		D_FATAL("DAOS_POOL is too long.\n");
+		D_GOTO(out, rc = ENAMETOOLONG);
+	}
+
+	rc = d_agetenv_str(&container, "DAOS_CONTAINER");
 	if (container == NULL) {
 		D_FATAL("DAOS_CONTAINER is not set.\n");
 		D_GOTO(out, rc = EINVAL);
 	}
 
+	len_container = strnlen(container, DAOS_PROP_MAX_LABEL_BUF_LEN);
+	if (len_container >= DAOS_PROP_MAX_LABEL_BUF_LEN) {
+		D_FATAL("DAOS_CONTAINER is too long.\n");
+		D_GOTO(out, rc = ENAMETOOLONG);
+	}
+
 	D_STRNDUP(dfs_list[num_dfs].fs_root, fs_root, len_fs_root);
 	if (dfs_list[num_dfs].fs_root == NULL)
 		D_GOTO(out, rc = ENOMEM);
-	dfs_list[num_dfs].pool         = pool;
-	dfs_list[num_dfs].cont         = container;
+	D_STRNDUP(dfs_list[num_dfs].pool, pool, len_pool);
+	if (dfs_list[num_dfs].pool == NULL)
+		D_GOTO(free_fs_root, rc = ENOMEM);
+
+	D_STRNDUP(dfs_list[num_dfs].cont, container, len_container);
+	if (dfs_list[num_dfs].cont == NULL)
+		D_GOTO(free_pool, rc = ENOMEM);
+
 	dfs_list[num_dfs].dfs_dir_hash = NULL;
-	dfs_list[num_dfs].len_fs_root  = len_fs_root;
+	dfs_list[num_dfs].len_fs_root  = (int)len_fs_root;
 	atomic_init(&dfs_list[num_dfs].inited, 0);
 	num_dfs++;
-	rc = 0;
+	D_GOTO(out, rc = 0);
 
+free_pool:
+	D_FREE(dfs_list[num_dfs].pool);
+free_fs_root:
+	D_FREE(dfs_list[num_dfs].fs_root);
 out:
+	d_freeenv_str(&container);
+	d_freeenv_str(&pool);
+	d_freeenv_str(&fs_root);
 	return rc;
 }
@@ -899,7 +937,7 @@ child_hdlr(void)
 	int rc;
 
 	/* daos is not initialized yet */
-	if (!daos_inited)
+	if (atomic_load_relaxed(&daos_inited) == false)
 		return;
 
 	daos_eq_lib_reset_after_fork();
@@ -913,6 +951,58 @@ child_hdlr(void)
 	context_reset = true;
 }
 
+/* only free the reserved low fds when the application exits or encounters an error */
+static void
+free_reserved_low_fd(void)
+{
+	int i;
+
+	for (i = 0; i < low_fd_count; i++)
+		libc_close(low_fd_list[i]);
+	low_fd_count = 0;
+}
+
+/* Some applications, especially bash scripts, use specific low fds directly.
+ * It would be safer to avoid using such low fds (fd < DAOS_MIN_FD) in daos.
+ * We consume such low fds before any daos calls and close them only when the
+ * application exits or encounters an error.
+ */
+static int
+consume_low_fd(void)
+{
+	int rc = 0;
+
+	if (atomic_load_relaxed(&daos_inited) == true)
+		return 0;
+
+	D_MUTEX_LOCK(&lock_reserve_fd);
+	low_fd_count              = 0;
+	low_fd_list[low_fd_count] = libc_open("/", O_PATH | O_DIRECTORY);
+	while (1) {
+		if (low_fd_list[low_fd_count] < 0) {
+			DS_ERROR(errno, "failed to reserve a low fd");
+			goto err;
+		} else if (low_fd_list[low_fd_count] >= DAOS_MIN_FD) {
+			libc_close(low_fd_list[low_fd_count]);
+			break;
+		} else {
+			low_fd_count++;
+		}
+		low_fd_list[low_fd_count] = libc_open("/", O_RDONLY);
+	}
+
+	D_MUTEX_UNLOCK(&lock_reserve_fd);
+	return rc;
+
+err:
+	rc = errno;
+	free_reserved_low_fd();
+	D_MUTEX_UNLOCK(&lock_reserve_fd);
+
+	return rc;
+}
+
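The reservation loop above in plain form: keep opening throwaway descriptors until the kernel hands back one at or above `DAOS_MIN_FD`, so nothing DAOS opens later can land on a low fd that a script may address directly. A minimal stand-alone sketch using raw POSIX calls rather than the interposer's `libc_open`/`libc_close` wrappers (names and locking omitted for brevity; `reserve_low_fds` is illustrative, not part of the patch):

```c
#include <fcntl.h>
#include <unistd.h>

#define DAOS_MIN_FD 10

static int reserved[DAOS_MIN_FD];
static int nreserved;

/* Occupy every free fd below DAOS_MIN_FD; returns 0 on success. */
static int
reserve_low_fds(void)
{
	int fd;

	while ((fd = open("/", O_RDONLY)) >= 0) {
		if (fd >= DAOS_MIN_FD) {
			close(fd); /* past the reserved window, stop */
			return 0;
		}
		reserved[nreserved++] = fd; /* hold it open to occupy the slot */
	}
	return -1; /* open() failed */
}
```

The reserved descriptors are only released in `free_reserved_low_fd()` when the application exits or initialization fails, precisely so the low-fd window stays occupied for the lifetime of the process.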
 /** determine whether a path (both relative and absolute) is on DAOS or not. If yes,
  * returns parent object, item name, full path of parent dir, full absolute path, and
  * the pointer to struct dfs_mt.
  */
@@ -962,7 +1052,7 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i
 
 	if (strncmp(szInput, ".", 2) == 0) {
 		/* special case for current work directory */
-		pt_end = stpncpy(full_path_parse, cur_dir, DFS_MAX_PATH);
+		pt_end = stpncpy(full_path_parse, cur_dir, DFS_MAX_PATH + 1);
 		len    = (int)(pt_end - full_path_parse);
 		if (len >= DFS_MAX_PATH) {
 			D_DEBUG(DB_ANY, "full_path_parse[] is not large enough: %d (%s)\n",
@@ -1005,24 +1095,37 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i
 
 	if (idx_dfs >= 0) {
 		/* trying to avoid lock as much as possible */
-		if (!daos_inited) {
+		if (atomic_load_relaxed(&daos_inited) == false) {
 			/* daos_init() is expensive to call. We call it only when necessary. */
+
+			rc = consume_low_fd();
+			if (rc) {
+				DS_ERROR(rc, "consume_low_fd() failed");
+				*is_target_path = 0;
+				goto out_normal;
+			}
+
 			rc = daos_init();
 			if (rc) {
 				DL_ERROR(rc, "daos_init() failed");
 				*is_target_path = 0;
				goto out_normal;
 			}
+
 			if (eq_count_max) {
-				rc = daos_eq_create(&td_eqh);
-				if (rc)
-					DL_WARN(rc, "daos_eq_create() failed");
-				main_eqh = td_eqh;
-				rc = pthread_atfork(NULL, NULL, &child_hdlr);
-				D_ASSERT(rc == 0);
+				D_MUTEX_LOCK(&lock_eqh);
+				if (daos_handle_is_inval(main_eqh)) {
+					rc = daos_eq_create(&td_eqh);
+					if (rc)
+						DL_WARN(rc, "daos_eq_create() failed");
+					main_eqh = td_eqh;
+					rc = pthread_atfork(NULL, NULL, &child_hdlr);
+					D_ASSERT(rc == 0);
+				}
+				D_MUTEX_UNLOCK(&lock_eqh);
 			}
-			daos_inited = true;
+
+			atomic_store_relaxed(&daos_inited, true);
 			atomic_fetch_add_relaxed(&daos_init_cnt, 1);
 		}
 
@@ -1100,6 +1203,7 @@ query_path(const char *szInput, int *is_target_path, dfs_obj_t **parent, char *i
 			D_GOTO(out_err, rc);
 		}
 	} else {
+		strncpy(*full_path, full_path_parse, len + 1);
 		*is_target_path = 0;
 		item_name[0] = '\0';
 	}
@@ -1296,6 +1400,7 @@ find_next_available_fd(struct file_obj *obj, int *new_fd)
 		new_obj->ref_count++;
 		file_list[idx] = new_obj;
 	}
+	dup_ref_count[idx] = 0;
 	if (next_free_fd > last_fd)
 		last_fd = next_free_fd;
 	next_free_fd = -1;
@@ -1315,6 +1420,24 @@ find_next_available_fd(struct file_obj *obj, int *new_fd)
 	return 0;
 }
 
+static void
+inc_dup_ref_count(int fd)
+{
+	D_MUTEX_LOCK(&lock_fd);
+	dup_ref_count[fd - FD_FILE_BASE]++;
+	file_list[fd - FD_FILE_BASE]->ref_count++;
+	D_MUTEX_UNLOCK(&lock_fd);
+}
+
+static void
+dec_dup_ref_count(int fd)
+{
+	D_MUTEX_LOCK(&lock_fd);
+	dup_ref_count[fd - FD_FILE_BASE]--;
+	file_list[fd - FD_FILE_BASE]->ref_count--;
+	D_MUTEX_UNLOCK(&lock_fd);
+}
+
 static int
 find_next_available_dirfd(struct dir_obj *obj, int *new_dir_fd)
 {
@@ -1399,7 +1522,7 @@ find_next_available_map(int *idx)
 
 /* May need to support duplicated fd as duplicated dirfd too. */
 static void
-free_fd(int idx)
+free_fd(int idx, bool closing_dup_fd)
 {
 	int              i, rc;
 	struct file_obj *saved_obj = NULL;
@@ -1412,9 +1535,15 @@ free_fd(int idx)
 		return;
 	}
 
+	if (closing_dup_fd)
+		dup_ref_count[idx]--;
 	file_list[idx]->ref_count--;
 	if (file_list[idx]->ref_count == 0)
 		saved_obj = file_list[idx];
+	if (dup_ref_count[idx] > 0) {
+		D_MUTEX_UNLOCK(&lock_fd);
+		return;
+	}
 	file_list[idx] = NULL;
 
 	if (idx < next_free_fd)
@@ -1506,7 +1635,7 @@ free_map(int idx)
 	mmap_list[idx].addr = NULL;
 	/* Need to call free_fd().
*/ if (file_list[mmap_list[idx].fd - FD_FILE_BASE]->idx_mmap >= MAX_MMAP_BLOCK) - free_fd(mmap_list[idx].fd - FD_FILE_BASE); + free_fd(mmap_list[idx].fd - FD_FILE_BASE, false); mmap_list[idx].fd = -1; if (idx < next_free_map) @@ -1547,46 +1676,23 @@ get_fd_redirected(int fd) return fd_ret; } -/* This fd is a fake fd. There exists a associated kernel fd with dup2. - * Need to check whether fd is in fd_dup2_list[], set dest_closed true - * if yes. Otherwise, close the fake fd. - */ -static void -close_dup_fd_dest_fakefd(int fd) -{ - int i; - - if (fd < FD_FILE_BASE) - return; - - D_MUTEX_LOCK(&lock_fd_dup2ed); - if (num_fd_dup2ed > 0) { - for (i = 0; i < MAX_FD_DUP2ED; i++) { - if (fd_dup2_list[i].fd_dest == fd) { - fd_dup2_list[i].dest_closed = true; - D_MUTEX_UNLOCK(&lock_fd_dup2ed); - return; - } - } - } - D_MUTEX_UNLOCK(&lock_fd_dup2ed); - - free_fd(fd - FD_FILE_BASE); -} - /* This fd is a fd from kernel and it is associated with a fake fd. - * Need to 1) close(fd) 2) remove the entry in fd_dup2_list[] 3) close - * the fake fd if dest_closed is true. + * Need to 1) close(fd) 2) remove the entry in fd_dup2_list[] 3) decrease + * the dup reference count of the fake fd. */ + static int -close_dup_fd_src(int (*next_close)(int fd), int fd) +close_dup_fd(int (*next_close)(int fd), int fd, bool close_fd) { int i, rc, idx_dup = -1, fd_dest = -1; - /* close the fd from kernel */ - rc = next_close(fd); - if (rc != 0) - return (-1); + if (close_fd) { + /* close the fd from kernel */ + assert(fd < FD_FILE_BASE); + rc = next_close(fd); + if (rc != 0) + return (-1); + } /* remove the fd_dup entry */ D_MUTEX_LOCK(&lock_fd_dup2ed); @@ -1594,12 +1700,10 @@ close_dup_fd_src(int (*next_close)(int fd), int fd) for (i = 0; i < MAX_FD_DUP2ED; i++) { if (fd_dup2_list[i].fd_src == fd) { idx_dup = i; - if (fd_dup2_list[i].dest_closed) - fd_dest = fd_dup2_list[i].fd_dest; + fd_dest = fd_dup2_list[i].fd_dest; /* clear the value to free */ fd_dup2_list[i].fd_src = -1; fd_dup2_list[i].fd_dest = -1; - fd_dup2_list[i].dest_closed = false; num_fd_dup2ed--; break; } @@ -1613,8 +1717,7 @@ close_dup_fd_src(int (*next_close)(int fd), int fd) errno = EINVAL; return (-1); } - if (fd_dest > 0) - free_fd(fd_dest - FD_FILE_BASE); + free_fd(fd_dest - FD_FILE_BASE, true); return 0; } @@ -1628,7 +1731,6 @@ init_fd_dup2_list(void) for (i = 0; i < MAX_FD_DUP2ED; i++) { fd_dup2_list[i].fd_src = -1; fd_dup2_list[i].fd_dest = -1; - fd_dup2_list[i].dest_closed = false; } D_MUTEX_UNLOCK(&lock_fd_dup2ed); } @@ -1638,6 +1740,9 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) { int i; + /* increase reference count of the fake fd */ + inc_dup_ref_count(fd_dest); + /* Not many applications use dup2(). Normally the number of fd duped is small. 
*/ D_MUTEX_LOCK(&lock_fd_dup2ed); if (num_fd_dup2ed < MAX_FD_DUP2ED) { @@ -1645,7 +1750,6 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) if (fd_dup2_list[i].fd_src == -1) { fd_dup2_list[i].fd_src = fd_src; fd_dup2_list[i].fd_dest = fd_dest; - fd_dup2_list[i].dest_closed = false; num_fd_dup2ed++; D_MUTEX_UNLOCK(&lock_fd_dup2ed); return i; @@ -1654,6 +1758,8 @@ allocate_dup2ed_fd(const int fd_src, const int fd_dest) } D_MUTEX_UNLOCK(&lock_fd_dup2ed); + /* decrease dup reference count in error */ + dec_dup_ref_count(fd_dest); DS_ERROR(EMFILE, "fd_dup2_list[] is out of space"); errno = EMFILE; return (-1); @@ -1695,7 +1801,7 @@ close_all_duped_fd(void) /* Only the main thread will call this function in the destruction phase */ for (i = 0; i < MAX_FD_DUP2ED; i++) { if (fd_dup2_list[i].fd_src >= 0) - close_dup_fd_src(libc_close, fd_dup2_list[i].fd_src); + close_dup_fd(libc_close, fd_dup2_list[i].fd_src, true); } num_fd_dup2ed = 0; } @@ -1765,7 +1871,7 @@ check_path_with_dirfd(int dirfd, char **full_path_out, const char *rel_path, int free(*full_path_out); *full_path_out = NULL; } - DS_ERROR(errno, "readlink() failed"); + D_DEBUG(DB_ANY, "readlink() failed: %d (%s)\n", errno, strerror(errno)); return (-1); } @@ -1780,8 +1886,8 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char mode_t mode_query = 0, mode_parent = 0; struct dfs_mt *dfs_mt; char item_name[DFS_MAX_NAME]; - char *parent_dir = NULL; - char *full_path = NULL; + char *parent_dir = NULL; + char *full_path = NULL; if (pathname == NULL) { errno = EFAULT; @@ -1839,14 +1945,15 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* file/dir should be handled by DFS */ if (oflags & O_CREAT) { - rc = dfs_open(dfs_mt->dfs, parent, item_name, mode | S_IFREG, oflags, 0, 0, NULL, - &dfs_obj); + rc = dfs_open(dfs_mt->dfs, parent, item_name, mode | S_IFREG, oflags & (~O_APPEND), + 0, 0, NULL, &dfs_obj); mode_query = S_IFREG; } else if (!parent && (strncmp(item_name, "/", 2) == 0)) { - rc = dfs_lookup(dfs_mt->dfs, "/", oflags, &dfs_obj, &mode_query, NULL); + rc = + dfs_lookup(dfs_mt->dfs, "/", oflags & (~O_APPEND), &dfs_obj, &mode_query, NULL); } else { - rc = dfs_lookup_rel(dfs_mt->dfs, parent, item_name, oflags, &dfs_obj, &mode_query, - NULL); + rc = dfs_lookup_rel(dfs_mt->dfs, parent, item_name, oflags & (~O_APPEND), &dfs_obj, + &mode_query, NULL); } if (rc) @@ -1908,6 +2015,15 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char FREE(parent_dir); + if (oflags & O_APPEND) { + struct stat fstat; + + rc = new_fxstat(1, idx_fd + FD_FILE_BASE, &fstat); + if (rc != 0) + return (-1); + file_list[idx_fd]->offset = fstat.st_size; + } + return (idx_fd + FD_FILE_BASE); org_func: @@ -2011,10 +2127,10 @@ new_close_common(int (*next_close)(int fd), int fd) } else if (fd_directed >= FD_FILE_BASE) { /* This fd is a kernel fd. There was a duplicate fd created. */ if (fd < FD_FILE_BASE) - return close_dup_fd_src(next_close, fd); + return close_dup_fd(next_close, fd, true); /* This fd is a fake fd. There exists a associated kernel fd with dup2. */ - close_dup_fd_dest_fakefd(fd); + free_fd(fd - FD_FILE_BASE, false); return 0; } @@ -2498,6 +2614,17 @@ new_fxstatat(int ver, int dirfd, const char *path, struct stat *stat_buf, int fl return new_xstat(1, path, stat_buf); } + if (dirfd >= FD_FILE_BASE && dirfd < FD_DIR_BASE) { + if (path[0] == 0 && flags & AT_EMPTY_PATH) + /* same as fstat for a file. 
May need further work to handle flags */ + return new_fxstat(ver, dirfd, stat_buf); + else if (path[0] == 0) + error = ENOENT; + else + error = ENOTDIR; + goto out_err; + } + idx_dfs = check_path_with_dirfd(dirfd, &full_path, path, &error); if (error) goto out_err; @@ -2540,6 +2667,17 @@ new_fstatat(int dirfd, const char *__restrict path, struct stat *__restrict stat return new_xstat(1, path, stat_buf); } + if (dirfd >= FD_FILE_BASE && dirfd < FD_DIR_BASE) { + if (path[0] == 0 && flags & AT_EMPTY_PATH) + /* same as fstat for a file. May need further work to handle flags */ + return fstat(dirfd, stat_buf); + else if (path[0] == 0) + error = ENOENT; + else + error = ENOTDIR; + goto out_err; + } + idx_dfs = check_path_with_dirfd(dirfd, &full_path, path, &error); if (error) goto out_err; @@ -3955,6 +4093,8 @@ getcwd(char *buf, size_t size) if (buf == NULL) { size_t len; + if (size == 0) + size = PATH_MAX; len = strnlen(cur_dir, size); if (len >= size) { errno = ERANGE; @@ -4075,14 +4215,12 @@ faccessat(int dirfd, const char *path, int mode, int flags) int chdir(const char *path) { - int is_target_path, rc, len_str, errno_save; + int is_target_path, rc, len_str; dfs_obj_t *parent; - struct stat stat_buf; struct dfs_mt *dfs_mt; char item_name[DFS_MAX_NAME]; - char *parent_dir = NULL; - char *full_path = NULL; - bool is_root; + char *parent_dir = NULL; + char *full_path = NULL; if (next_chdir == NULL) { next_chdir = dlsym(RTLD_NEXT, "chdir"); @@ -4095,36 +4233,22 @@ chdir(const char *path) &full_path, &dfs_mt); if (rc) D_GOTO(out_err, rc); - if (!is_target_path) { - FREE(parent_dir); - rc = next_chdir(path); - errno_save = errno; - if (rc == 0) - update_cwd(); - errno = errno_save; - return rc; - } - if (!parent && (strncmp(item_name, "/", 2) == 0)) { - is_root = true; - rc = dfs_stat(dfs_mt->dfs, NULL, NULL, &stat_buf); - } else { - is_root = false; - rc = dfs_stat(dfs_mt->dfs, parent, item_name, &stat_buf); - } + rc = next_chdir(path); if (rc) - D_GOTO(out_err, rc); - if (!S_ISDIR(stat_buf.st_mode)) { - D_DEBUG(DB_ANY, "%s is not a directory: %d (%s)\n", path, ENOTDIR, - strerror(ENOTDIR)); - D_GOTO(out_err, rc = ENOTDIR); + D_GOTO(out_err, rc = errno); + + if (!is_target_path) { + strncpy(cur_dir, full_path, DFS_MAX_PATH); + if (cur_dir[DFS_MAX_PATH - 1] != 0) { + D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, + strerror(ENAMETOOLONG)); + D_GOTO(out_err, rc = ENAMETOOLONG); + } + D_GOTO(out, rc); } - if (is_root) - rc = dfs_access(dfs_mt->dfs, NULL, NULL, X_OK); - else - rc = dfs_access(dfs_mt->dfs, parent, item_name, X_OK); - if (rc) - D_GOTO(out_err, rc); + + /* assuming the path exists and it is backed by dfuse */ len_str = snprintf(cur_dir, DFS_MAX_PATH, "%s%s", dfs_mt->fs_root, full_path); if (len_str >= DFS_MAX_PATH) { D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, @@ -4132,6 +4256,7 @@ chdir(const char *path) D_GOTO(out_err, rc = ENAMETOOLONG); } +out: FREE(parent_dir); return 0; @@ -4144,6 +4269,7 @@ chdir(const char *path) int fchdir(int dirfd) { + int rc; char *pt_end = NULL; if (next_fchdir == NULL) { @@ -4156,6 +4282,15 @@ fchdir(int dirfd) if (dirfd < FD_DIR_BASE) return next_fchdir(dirfd); + /* assume dfuse is running. call chdir() to update cwd. 
*/ + if (next_chdir == NULL) { + next_chdir = dlsym(RTLD_NEXT, "chdir"); + D_ASSERT(next_chdir != NULL); + } + rc = next_chdir(dir_list[dirfd - FD_DIR_BASE]->path); + if (rc) + return rc; + pt_end = stpncpy(cur_dir, dir_list[dirfd - FD_DIR_BASE]->path, DFS_MAX_PATH - 1); if ((long int)(pt_end - cur_dir) >= DFS_MAX_PATH - 1) { D_DEBUG(DB_ANY, "path is too long: %d (%s)\n", ENAMETOOLONG, @@ -4979,7 +5114,7 @@ dup(int oldfd) int dup2(int oldfd, int newfd) { - int fd, fd_directed, idx, rc, errno_save; + int fd, oldfd_directed, newfd_directed, fd_directed, idx, rc, errno_save; /* Need more work later. */ if (next_dup2 == NULL) { @@ -4995,16 +5130,27 @@ dup2(int oldfd, int newfd) else return newfd; } - if ((oldfd < FD_FILE_BASE) && (newfd < FD_FILE_BASE)) + oldfd_directed = query_fd_forward_dest(oldfd); + newfd_directed = query_fd_forward_dest(newfd); + if ((oldfd_directed < FD_FILE_BASE) && (oldfd < FD_FILE_BASE) && + (newfd_directed < FD_FILE_BASE) && (newfd < FD_FILE_BASE)) return next_dup2(oldfd, newfd); + if (oldfd_directed >= FD_FILE_BASE && oldfd < FD_FILE_BASE) + oldfd = oldfd_directed; + if (newfd >= FD_FILE_BASE) { DS_ERROR(ENOTSUP, "unimplemented yet for newfd >= FD_FILE_BASE"); errno = ENOTSUP; return -1; } fd_directed = query_fd_forward_dest(newfd); - if (fd_directed >= FD_FILE_BASE) { + if (fd_directed >= FD_FILE_BASE && newfd < FD_FILE_BASE && oldfd_directed < FD_FILE_BASE && + oldfd < FD_FILE_BASE) { + /* need to remove newfd from forward list and decrease refcount in file_list[] */ + close_dup_fd(libc_close, newfd, false); + return next_dup2(oldfd, newfd); + } else if (fd_directed >= FD_FILE_BASE) { DS_ERROR(ENOTSUP, "unimplemented yet for fd_directed >= FD_FILE_BASE"); errno = ENOTSUP; return -1; @@ -5015,13 +5161,22 @@ dup2(int oldfd, int newfd) else fd_directed = query_fd_forward_dest(oldfd); if (fd_directed >= FD_FILE_BASE) { - rc = close(newfd); - if (rc != 0 && errno != EBADF) - return -1; - fd = allocate_a_fd_from_kernel(); + int fd_tmp; + + fd_tmp = allocate_a_fd_from_kernel(); + if (fd_tmp < 0) { + /* failed to allocate an fd from kernel */ + errno_save = errno; + DS_ERROR(errno_save, "failed to get a fd from kernel"); + errno = errno_save; + return (-1); + } + /* rely on dup2() to get the desired fd */ + fd = next_dup2(fd_tmp, newfd); if (fd < 0) { /* failed to allocate an fd from kernel */ errno_save = errno; + close(fd_tmp); DS_ERROR(errno_save, "failed to get a fd from kernel"); errno = errno_save; return (-1); @@ -5031,6 +5186,9 @@ dup2(int oldfd, int newfd) errno = EBUSY; return (-1); } + rc = libc_close(fd_tmp); + if (rc != 0) + return -1; idx = allocate_dup2ed_fd(fd, fd_directed); if (idx >= 0) return fd; @@ -5515,11 +5673,12 @@ init_myhook(void) else daos_debug_inited = true; - env_log = getenv("D_IL_REPORT"); + d_agetenv_str(&env_log, "D_IL_REPORT"); if (env_log) { report = true; if (strncmp(env_log, "0", 2) == 0 || strncasecmp(env_log, "false", 6) == 0) report = false; + d_freeenv_str(&env_log); } /* Find dfuse mounts from /proc/mounts */ @@ -5542,6 +5701,10 @@ init_myhook(void) } update_cwd(); + rc = D_MUTEX_INIT(&lock_reserve_fd, NULL); + if (rc) + return; + rc = D_MUTEX_INIT(&lock_dfs, NULL); if (rc) return; @@ -5673,7 +5836,7 @@ close_all_fd(void) for (i = 0; i <= last_fd; i++) { if (file_list[i]) - free_fd(i); + free_fd(i, false); } } @@ -5723,6 +5886,7 @@ finalize_myhook(void) finalize_dfs(); D_MUTEX_DESTROY(&lock_eqh); + D_MUTEX_DESTROY(&lock_reserve_fd); D_MUTEX_DESTROY(&lock_dfs); D_MUTEX_DESTROY(&lock_dirfd); D_MUTEX_DESTROY(&lock_fd); @@ -5804,6 
+5968,8 @@ finalize_dfs(void)
 	for (i = 0; i < num_dfs; i++) {
 		if (dfs_list[i].dfs_dir_hash == NULL) {
 			D_FREE(dfs_list[i].fs_root);
+			D_FREE(dfs_list[i].pool);
+			D_FREE(dfs_list[i].cont);
 			continue;
 		}
@@ -5835,11 +6001,14 @@ finalize_dfs(void)
 			continue;
 		}
 		D_FREE(dfs_list[i].fs_root);
+		D_FREE(dfs_list[i].pool);
+		D_FREE(dfs_list[i].cont);
 	}
-	if (daos_inited) {
+	if (atomic_load_relaxed(&daos_inited)) {
 		uint32_t init_cnt, j;
+		free_reserved_low_fd();
 		init_cnt = atomic_load_relaxed(&daos_init_cnt);
 		for (j = 0; j < init_cnt; j++) {
 			rc = daos_fini();
diff --git a/src/client/pydaos/pydaos_shim.c b/src/client/pydaos/pydaos_shim.c
index 81bbbc476978..22e671fbcdc5 100644
--- a/src/client/pydaos/pydaos_shim.c
+++ b/src/client/pydaos/pydaos_shim.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -84,8 +84,8 @@ do {						\
	}					\
} while (0)

-static daos_handle_t glob_eq;
-static int use_glob_eq;
+static daos_handle_t glob_eq;
+static bool          use_glob_eq;

/**
 * Implementations of baseline shim functions
@@ -95,18 +95,17 @@ static PyObject *
__shim_handle__daos_init(PyObject *self, PyObject *args)
{
	int rc;
-	int ret;
-	char *override;

	rc = daos_init();
	if ((rc == 0) && (use_glob_eq == 0)) {
-		override = getenv("PYDAOS_GLOB_EQ");
-		if ((override == NULL) || strcmp(override, "0")) {
-			use_glob_eq = 1;
+		d_getenv_bool("PYDAOS_GLOB_EQ", &use_glob_eq);
+		if (use_glob_eq) {
+			int ret;
+
			ret = daos_eq_create(&glob_eq);
			if (ret) {
-				D_ERROR("Failed to create global eq, "DF_RC"\n", DP_RC(ret));
-				use_glob_eq = 0;
+				DL_ERROR(ret, "Failed to create global eq");
+				use_glob_eq = false;
			}
		}
	}
@@ -123,7 +122,7 @@ __shim_handle__daos_fini(PyObject *self, PyObject *args)
		rc = daos_eq_destroy(glob_eq, DAOS_EQ_DESTROY_FORCE);
		if (rc)
			D_ERROR("Failed to destroy global eq, "DF_RC"\n", DP_RC(rc));
-		use_glob_eq = 0;
+		use_glob_eq = false;
	}

	rc = daos_fini();
diff --git a/src/client/setup.py b/src/client/setup.py
index c9e52afa8de5..14155c75f459 100644
--- a/src/client/setup.py
+++ b/src/client/setup.py
@@ -9,35 +9,21 @@

python3 setup.py install

-If run from within a compiled DAOS source tree this it will detect the
-install path automatically, otherwise it'll use the defaults.
+This can be run from either the installed daos packages or from an install directory. Note that
+Python requires write access to the directory in order to install, so when installing from RPMs a
+copy may have to be made before installing.
""" -import json import os from setuptools import Extension, find_packages, setup - -def load_conf(): - """Load the build config file""" - file_self = os.path.dirname(os.path.abspath(__file__)) - while file_self != "/": - new_file = os.path.join(file_self, ".build_vars.json") - if os.path.exists(new_file): - with open(new_file, "r", encoding="utf-8") as ofh: - return json.load(ofh) - - file_self = os.path.dirname(file_self) - return None - - -conf = load_conf() - args = {"sources": ["pydaos/pydaos_shim.c"], "libraries": ["daos", "duns"]} -if conf: - args["include_dirs"] = [os.path.join(conf["PREFIX"], "include")] - args["library_dirs"] = [os.path.join(conf["PREFIX"], "lib64")] +prefix_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "..") + +if os.path.exists(os.path.join(prefix_dir, "include", "daos.h")): + args["include_dirs"] = [os.path.join(prefix_dir, "include")] + args["library_dirs"] = [os.path.join(prefix_dir, "lib64")] args["runtime_library_dirs"] = args["library_dirs"] diff --git a/src/common/btree.c b/src/common/btree.c index b2db9424e468..6d39735cf531 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -16,6 +16,8 @@ #include #include +#define BTR_EXT_FEAT_MASK (BTR_FEAT_MASK ^ BTR_FEAT_EMBEDDED) + /** * Tree node types. * NB: a node can be both root and leaf. @@ -83,6 +85,25 @@ struct btr_iterator { unsigned int it_collisions; }; +enum { + /** Entry is not embedded */ + BTR_EMBEDDED_NONE = 0, + /** Hash key is present */ + BTR_EMBEDDED_HASH = (1 << 14), + /** Record is embedded, may or may not be probed */ + BTR_EMBEDDED_SET = (1 << 15), + /** Embedded entry EQ probed entry */ + BTR_EMBEDDED_EQ = BTR_EMBEDDED_SET | (0), + /** Embedded entry LT probed entry */ + BTR_EMBEDDED_LT = BTR_EMBEDDED_SET | (1), + /** Embedded entry GT probed entry */ + BTR_EMBEDDED_GT = BTR_EMBEDDED_SET | (2), +}; + +D_CASSERT(BTR_CMP_EQ == (BTR_EMBEDDED_EQ ^ BTR_EMBEDDED_SET)); +D_CASSERT(BTR_CMP_LT == (BTR_EMBEDDED_LT ^ BTR_EMBEDDED_SET)); +D_CASSERT(BTR_CMP_GT == (BTR_EMBEDDED_GT ^ BTR_EMBEDDED_SET)); + /** * Trace for tree search. 
*/ @@ -93,6 +114,11 @@ struct btr_trace { unsigned int tr_at; }; +struct btr_trace_info { + struct btr_trace *ti_trace; + uint32_t ti_embedded_info; +}; + /** backtrace depth */ #define BTR_TRACE_MAX 40 @@ -105,6 +131,10 @@ struct btr_context { struct btr_instance tc_tins; /** embedded iterator */ struct btr_iterator tc_itr; + /** embedded fake record for the purpose of handling embedded value */ + struct btr_record tc_record; + /** This provides space for the hkey for the fake record */ + struct ktr_hkey tc_hkey; /** cached configured tree order */ uint16_t tc_order; /** cached tree depth, avoid loading from slow memory */ @@ -128,7 +158,7 @@ struct btr_context { /** cached feature bits, avoid loading from slow memory */ uint64_t tc_feats; /** trace for the tree root */ - struct btr_trace *tc_trace; + struct btr_trace_info tc_trace; /** trace buffer */ struct btr_trace tc_traces[BTR_TRACE_MAX]; }; @@ -165,6 +195,40 @@ btr_has_tx(struct btr_context *tcx) return umem_has_tx(btr_umm(tcx)); } +/** The tree has support for the embedded value feature */ +static inline bool +btr_supports_embedded_value(struct btr_context *tcx) +{ + return (tcx->tc_feats & BTR_FEAT_EMBED_FIRST) != 0; +} + +/** Returns true if we should insert an embedded value */ +static bool +btr_use_embedded_value(struct btr_context *tcx) +{ + return (btr_supports_embedded_value(tcx) && tcx->tc_depth == 0); +} + +/** Returns true if the tree currently has an embedded value */ +static bool +btr_has_embedded_value(struct btr_context *tcx) +{ + return (tcx->tc_feats & BTR_FEAT_EMBEDDED); +} + +static inline int +btr_embedded_cmp(struct btr_context *tcx) +{ + D_ASSERT(tcx->tc_trace.ti_embedded_info != BTR_EMBEDDED_NONE); + return tcx->tc_trace.ti_embedded_info & ~(BTR_EMBEDDED_HASH | BTR_EMBEDDED_SET); +} + +static inline void +btr_embedded_hash_set(struct btr_context *tcx) +{ + tcx->tc_trace.ti_embedded_info |= BTR_EMBEDDED_HASH; +} + #define BTR_IS_DIRECT_KEY(feats) ((feats) & BTR_FEAT_DIRECT_KEY) static bool @@ -244,7 +308,8 @@ static void btr_context_set_depth(struct btr_context *tcx, unsigned int depth) { tcx->tc_depth = depth; - tcx->tc_trace = &tcx->tc_traces[BTR_TRACE_MAX - depth]; + tcx->tc_trace.ti_trace = &tcx->tc_traces[BTR_TRACE_MAX - depth]; + tcx->tc_trace.ti_embedded_info = BTR_EMBEDDED_NONE; } static inline btr_ops_t * @@ -336,19 +401,20 @@ btr_context_clone(struct btr_context *tcx, struct btr_context **tcx_p) * for the new root if \a level is -1. */ static void -btr_trace_set(struct btr_context *tcx, int level, - umem_off_t nd_off, int at) +btr_trace_set(struct btr_context *tcx, int level, umem_off_t nd_off, int at, uint32_t embedded) { D_ASSERT(at >= 0 && at < tcx->tc_order); D_ASSERT(tcx->tc_depth > 0); D_ASSERT(level >= 0 && level < tcx->tc_depth); - D_ASSERT(&tcx->tc_trace[level] < &tcx->tc_traces[BTR_TRACE_MAX]); + D_ASSERT(&tcx->tc_trace.ti_trace[level] < &tcx->tc_traces[BTR_TRACE_MAX]); - D_DEBUG(DB_TRACE, "trace[%d] "DF_X64"/%d\n", level, nd_off, at); + D_DEBUG(DB_TRACE, "trace[%d] " DF_X64 "/%d %s\n", level, nd_off, at, + (embedded & BTR_EMBEDDED_SET) ? 
"embedded" : ""); D_ASSERT(nd_off != UMOFF_NULL); - tcx->tc_trace[level].tr_node = nd_off; - tcx->tc_trace[level].tr_at = at; + tcx->tc_trace.ti_trace[level].tr_node = nd_off; + tcx->tc_trace.ti_trace[level].tr_at = at; + tcx->tc_trace.ti_embedded_info = embedded; } /** fetch the record of the specified trace level */ @@ -360,24 +426,33 @@ btr_trace2rec(struct btr_context *tcx, int level) D_ASSERT(tcx->tc_depth > 0); D_ASSERT(tcx->tc_depth > level); - trace = &tcx->tc_trace[level]; + trace = &tcx->tc_trace.ti_trace[level]; D_ASSERT(!UMOFF_IS_NULL(trace->tr_node)); + if (tcx->tc_trace.ti_embedded_info) { + D_ASSERT(level == 0); + tcx->tc_record.rec_off = trace->tr_node; + return &tcx->tc_record; + } + return btr_node_rec_at(tcx, trace->tr_node, trace->tr_at); } -#define \ -btr_trace_debug(tcx, trace, format, ...) \ -do { \ - umem_off_t __off = (trace)->tr_node; \ - int __level = (int)((trace) - (tcx)->tc_trace); \ - \ - D_DEBUG(DB_TRACE, \ - "node="DF_X64" (l=%d k=%d at=%d): " format, \ - __off, __level, \ - ((struct btr_node *)btr_off2ptr((tcx), __off))->tn_keyn,\ - (trace)->tr_at, ## __VA_ARGS__); \ -} while (0) +#define btr_trace_debug(tcx, trace, format, ...) \ + do { \ + umem_off_t __off = (trace)->tr_node; \ + int __level; \ + \ + if ((tcx)->tc_trace.ti_embedded_info) { \ + D_DEBUG(DB_TRACE, "Embedded record rec=" DF_X64 " (info=%d\n)", __off, \ + (tcx)->tc_trace.ti_embedded_info); \ + break; \ + } \ + __level = (int)((trace) - (tcx)->tc_trace.ti_trace); \ + D_DEBUG(DB_TRACE, "node=" DF_X64 " (l=%d k=%d at=%d): " format, __off, __level, \ + ((struct btr_node *)btr_off2ptr((tcx), __off))->tn_keyn, (trace)->tr_at, \ + ##__VA_ARGS__); \ + } while (0) void hkey_common_gen(d_iov_t *key_iov, void *hkey) @@ -847,10 +922,18 @@ btr_root_init(struct btr_context *tcx, struct btr_root *root, bool in_place) root->tr_class = tcx->tc_class; root->tr_feats = tcx->tc_feats; root->tr_order = tcx->tc_order; - if (tcx->tc_feats & BTR_FEAT_DYNAMIC_ROOT) - root->tr_node_size = 1; - else + if (tcx->tc_feats & BTR_FEAT_DYNAMIC_ROOT) { + /** If the first entry will be embedded, we'll need to insert 2 + * entries, so set the initial size accordingly. At present, + * we always go from 1 to 3. + */ + if (tcx->tc_feats & BTR_FEAT_EMBED_FIRST) + root->tr_node_size = MIN(3, tcx->tc_order); + else + root->tr_node_size = 1; + } else { root->tr_node_size = tcx->tc_order; + } root->tr_node = BTR_NODE_NULL; return 0; @@ -887,22 +970,58 @@ btr_root_tx_add(struct btr_context *tcx) return rc; } +static int +btr_embedded_create_hash(struct btr_context *tcx, bool force) +{ + struct btr_record *rec = &tcx->tc_record; + int rc; + d_iov_t old_key = {0}; + + if (!btr_has_embedded_value(tcx)) + return 0; + + if (force || (tcx->tc_trace.ti_embedded_info & BTR_EMBEDDED_HASH) == 0) { + rc = btr_rec_fetch(tcx, rec, &old_key, NULL); + if (rc != 0) { + D_ERROR("Failed to get key from embedded record: " DF_RC "\n", DP_RC(rc)); + return rc; + } + btr_hkey_gen(tcx, &old_key, &rec->rec_hkey[0]); + btr_embedded_hash_set(tcx); + } + + return 0; +} + /** * Create btr_node for the empty root, insert the first \a rec into it. 
 */
 int
-btr_root_start(struct btr_context *tcx, struct btr_record *rec)
+btr_root_start(struct btr_context *tcx, struct btr_record *rec, d_iov_t *key, bool embed)
 {
 	struct btr_root   *root;
 	struct btr_record *rec_dst;
+	struct btr_record *existing_rec;
+	int                cmp;
 	struct btr_node   *nd;
 	umem_off_t         nd_off;
 	int                rc;
+	int                key_nr           = 1;
+	int                insertion_off    = 0;
+	uint32_t           embedded_setting = BTR_EMBEDDED_NONE;

 	root = tcx->tc_tins.ti_root;
-	D_ASSERT(UMOFF_IS_NULL(root->tr_node));
-	D_ASSERT(root->tr_depth == 0);
+	if (!btr_has_embedded_value(tcx)) {
+		D_ASSERT(UMOFF_IS_NULL(root->tr_node));
+		D_ASSERT(root->tr_depth == 0);
+	}
+
+	if (embed) {
+		embedded_setting = BTR_EMBEDDED_SET;
+		nd_off           = rec->rec_off;
+		goto set_root;
+	}

 	rc = btr_node_alloc(tcx, &nd_off);
 	if (rc != 0) {
@@ -913,11 +1032,52 @@ btr_root_start(struct btr_context *tcx, struct btr_record *rec)
 	/* root is also leaf, records are stored in root */
 	btr_node_set(tcx, nd_off, BTR_NODE_ROOT | BTR_NODE_LEAF);
 	nd = btr_off2ptr(tcx, nd_off);
-	nd->tn_keyn = 1;
-	rec_dst = btr_node_rec_at(tcx, nd_off, 0);
+	/** If we have an embedded entry, we need to insert 2 entries here */
+	if (btr_has_embedded_value(tcx)) {
+		existing_rec          = &tcx->tc_record;
+		existing_rec->rec_off = tcx->tc_tins.ti_root->tr_node;
+		cmp                   = btr_embedded_cmp(tcx);
+		/** If it's a direct key, we already know the comparison result. Otherwise, we
+		 * must compute and compare the hash keys.
+		 */
+		if (!btr_is_direct_key(tcx)) {
+			btr_hkey_gen(tcx, key, &rec->rec_hkey[0]);
+
+			rc = btr_embedded_create_hash(tcx, false);
+			if (rc != 0)
+				return rc;
+
+			cmp = btr_hkey_cmp(tcx, existing_rec, &rec->rec_hkey[0]);
+		} else {
+			memset(&tcx->tc_hkey, 0, sizeof(tcx->tc_hkey));
+		}
+
+		D_ASSERTF(cmp != BTR_CMP_EQ, "Hash collision is not supported\n");
+
+		/** Just insert the lesser key first and then set rec to the greater
+		 * key and let it insert there.
+ */ + key_nr = 2; + insertion_off = 1; + + rec_dst = btr_node_rec_at(tcx, nd_off, 0); + if (cmp == BTR_CMP_LT) { + /** This means the new key should go second */ + btr_rec_copy(tcx, rec_dst, existing_rec, 1); + } else { + /** This means the old key should go second */ + btr_rec_copy(tcx, rec_dst, rec, 1); + rec = existing_rec; + } + } + + nd->tn_keyn = key_nr; + + rec_dst = btr_node_rec_at(tcx, nd_off, insertion_off); btr_rec_copy(tcx, rec_dst, rec, 1); +set_root: if (btr_has_tx(tcx)) { rc = btr_root_tx_add(tcx); if (rc != 0) { @@ -929,9 +1089,16 @@ btr_root_start(struct btr_context *tcx, struct btr_record *rec) root->tr_node = nd_off; root->tr_depth = 1; + if (embed) { + root->tr_feats |= BTR_FEAT_EMBEDDED; + tcx->tc_feats = root->tr_feats; + } else if (btr_has_embedded_value(tcx)) { + root->tr_feats ^= BTR_FEAT_EMBEDDED; + tcx->tc_feats = root->tr_feats; + } btr_context_set_depth(tcx, root->tr_depth); - btr_trace_set(tcx, 0, nd_off, 0); + btr_trace_set(tcx, 0, nd_off, 0, embedded_setting); return 0; } @@ -976,7 +1143,7 @@ btr_root_grow(struct btr_context *tcx, umem_off_t off_left, nd->tn_child = off_left; nd->tn_keyn = 1; - at = !btr_node_is_equal(tcx, off_left, tcx->tc_trace->tr_node); + at = !btr_node_is_equal(tcx, off_left, tcx->tc_trace.ti_trace->tr_node); /* replace the root offset, increase tree level */ if (btr_has_tx(tcx)) { @@ -992,7 +1159,7 @@ btr_root_grow(struct btr_context *tcx, umem_off_t off_left, root->tr_depth++; btr_context_set_depth(tcx, root->tr_depth); - btr_trace_set(tcx, 0, nd_off, at); + btr_trace_set(tcx, 0, nd_off, at, BTR_EMBEDDED_NONE); return 0; } @@ -1003,27 +1170,14 @@ struct btr_check_alb { }; static int -btr_check_availability(struct btr_context *tcx, struct btr_check_alb *alb) +btr_check_record_availability(struct btr_context *tcx, struct btr_record *rec, uint32_t intent) { - struct btr_record *rec; - int rc; - - if (btr_ops(tcx)->to_check_availability == NULL) - return PROBE_RC_OK; - - if (UMOFF_IS_NULL(alb->nd_off)) { /* compare the leaf trace */ - struct btr_trace *trace = &tcx->tc_traces[BTR_TRACE_MAX - 1]; - - alb->nd_off = trace->tr_node; - alb->at = trace->tr_at; - } + int rc; - if (!btr_node_is_leaf(tcx, alb->nd_off)) + if (likely(btr_ops(tcx)->to_check_availability == NULL)) return PROBE_RC_OK; - rec = btr_node_rec_at(tcx, alb->nd_off, alb->at); - rc = btr_ops(tcx)->to_check_availability(&tcx->tc_tins, rec, - alb->intent); + rc = btr_ops(tcx)->to_check_availability(&tcx->tc_tins, rec, intent); if (rc == -DER_INPROGRESS) /* Uncertain */ return PROBE_RC_INPROGRESS; @@ -1054,8 +1208,32 @@ btr_check_availability(struct btr_context *tcx, struct btr_check_alb *alb) case ALB_UNAVAILABLE: default: /* Unavailable */ - return PROBE_RC_UNAVAILABLE; + break; } + + return PROBE_RC_UNAVAILABLE; +} + +static int +btr_check_availability(struct btr_context *tcx, struct btr_check_alb *alb) +{ + struct btr_record *rec; + + if (likely(btr_ops(tcx)->to_check_availability == NULL)) + return PROBE_RC_OK; + + if (UMOFF_IS_NULL(alb->nd_off)) { /* compare the leaf trace */ + struct btr_trace *trace = &tcx->tc_traces[BTR_TRACE_MAX - 1]; + + alb->nd_off = trace->tr_node; + alb->at = trace->tr_at; + } + + if (!btr_node_is_leaf(tcx, alb->nd_off)) + return PROBE_RC_OK; + + rec = btr_node_rec_at(tcx, alb->nd_off, alb->at); + return btr_check_record_availability(tcx, rec, alb->intent); } static int @@ -1128,7 +1306,7 @@ btr_split_at(struct btr_context *tcx, int level, umem_off_t off_left, umem_off_t off_right) { - struct btr_trace *trace = &tcx->tc_trace[level]; + struct 
btr_trace *trace = &tcx->tc_trace.ti_trace[level]; int order = tcx->tc_order; int split_at; bool left; @@ -1142,9 +1320,9 @@ btr_split_at(struct btr_context *tcx, int level, btr_trace_debug(tcx, trace, "split_at %d, insert to the %s node\n", split_at, left ? "left" : "right"); if (left) - btr_trace_set(tcx, level, off_left, trace->tr_at); + btr_trace_set(tcx, level, off_left, trace->tr_at, BTR_EMBEDDED_NONE); else - btr_trace_set(tcx, level, off_right, trace->tr_at - split_at); + btr_trace_set(tcx, level, off_right, trace->tr_at - split_at, BTR_EMBEDDED_NONE); return split_at; } @@ -1169,8 +1347,8 @@ btr_node_split_and_insert(struct btr_context *tcx, struct btr_trace *trace, bool leaf; bool right; - D_ASSERT(trace >= tcx->tc_trace); - level = trace - tcx->tc_trace; + D_ASSERT(trace >= tcx->tc_trace.ti_trace); + level = trace - tcx->tc_trace.ti_trace; off_left = trace->tr_node; rc = btr_node_alloc(tcx, &off_right); @@ -1406,6 +1584,84 @@ btr_probe_valid(dbtree_probe_opc_t opc) opc == BTR_PROBE_GE || opc == BTR_PROBE_LE); } +static enum btr_probe_rc +btr_probe_embedded(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, uint32_t intent, + d_iov_t *key, char hkey[DAOS_HKEY_MAX]) +{ + struct btr_record *rec = &tcx->tc_record; + int rc; + int rc2; + int cmp = BTR_CMP_EQ; /** for FIRST/LAST probe, this doesn't matter */ + int hash_flag = 0; + + D_ASSERTF(!btr_is_int_key(tcx), "Embedded root is incompatible with integer keys\n"); + + /** Use the fake record since only the user allocated part is stored + * in the tree root. + */ + rec->rec_off = tcx->tc_tins.ti_root->tr_node; + + rc = PROBE_RC_OK; + if (probe_opc & BTR_PROBE_SPEC) { + if (key != NULL) { + /** Simple case */ + cmp = btr_key_cmp(tcx, rec, key); + } else { + /** Restoring from anchor. For direct key, the decoded + * key will have been passed in. Otherwise, only the + * hash key is passed in so we need to compute the hash + * key for the embedded record + */ + rc2 = btr_embedded_create_hash(tcx, true); + if (rc2 != 0) { + D_ERROR("Could not create hash key from anchor: " DF_RC "\n", + DP_RC(rc2)); + D_GOTO(out, rc = PROBE_RC_ERR); + } + cmp = btr_hkey_cmp(tcx, rec, &hkey[0]); + hash_flag = BTR_EMBEDDED_HASH; + } + + D_ASSERTF(cmp != BTR_CMP_ERR, + "BTR_CMP_ERR is not supported with BTR_FEAT_EMBED_FIRST"); + switch (probe_opc) { + default: + D_ASSERT(0); + case BTR_PROBE_GE: + if (cmp == BTR_CMP_EQ) + break; + /** fall through */ + case BTR_PROBE_GT: + if (cmp != BTR_CMP_GT) + rc = PROBE_RC_NONE; + break; + case BTR_PROBE_LE: + if (cmp == BTR_CMP_EQ) { + rc = PROBE_RC_NONE; + break; + } + /** fall through */ + case BTR_PROBE_LT: + if (cmp != BTR_CMP_LT) + rc = PROBE_RC_NONE; + break; + case BTR_PROBE_EQ: + if (cmp != BTR_CMP_EQ) + rc = PROBE_RC_NONE; + break; + } + } /** For BTR_PROBE_{FIRST,LAST} */ + + /** Static asserts are in place to ensure this works */ + btr_trace_set(tcx, 0, rec->rec_off, 0, hash_flag | BTR_EMBEDDED_SET | cmp); +out: + tcx->tc_probe_rc = rc; + if (rc == PROBE_RC_ERR) + D_ERROR("Failed to probe: rc = %d\n", tcx->tc_probe_rc); + + return rc; +} + /** * Try to find \a key within a btree, it will store the searching path in * tcx::tc_traces. 
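For reference, `btr_probe_embedded()` above records its outcome by packing two kinds of information into the single `ti_embedded_info` word set via `btr_trace_set()`: state flags in the high bits (`BTR_EMBEDDED_SET`, `BTR_EMBEDDED_HASH`) and the comparison result in the low bits, which is exactly what the earlier `D_CASSERT`s relating `BTR_CMP_*` to `BTR_EMBEDDED_* ^ BTR_EMBEDDED_SET` guarantee. A minimal standalone sketch of that round trip follows; the constants and names are illustrative stand-ins, not the real DAOS definitions:

```c
/* Minimal sketch (not DAOS code) of the ti_embedded_info encoding:
 * state flags live in the high bits, the comparison result in the low
 * bits, mirroring the D_CASSERTs in the patch. Names are illustrative.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum { CMP_EQ = 0, CMP_LT = 1, CMP_GT = 2 }; /* stand-ins for BTR_CMP_* */

#define EMB_HASH (1u << 14) /* like BTR_EMBEDDED_HASH: hash key is cached      */
#define EMB_SET  (1u << 15) /* like BTR_EMBEDDED_SET: probe hit embedded rec   */

/* Pack "embedded record was probed" together with the comparison outcome. */
static uint32_t
emb_encode(int cmp, int have_hash)
{
	return EMB_SET | (have_hash ? EMB_HASH : 0) | (uint32_t)cmp;
}

/* Mirrors btr_embedded_cmp(): mask off the flag bits to recover the result. */
static int
emb_decode(uint32_t info)
{
	assert(info & EMB_SET); /* only valid when the embedded record was probed */
	return (int)(info & ~(EMB_HASH | EMB_SET));
}

int
main(void)
{
	uint32_t info = emb_encode(CMP_GT, 1);

	assert(emb_decode(info) == CMP_GT); /* comparison survives the round trip */
	assert(info & EMB_HASH);            /* hash flag is preserved alongside   */
	printf("info=0x%x cmp=%d\n", (unsigned)info, emb_decode(info));
	return 0;
}
```

Caching the comparison in the trace word this way lets a follow-on insert or delete reuse the probe outcome without touching the key again.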
@@ -1448,6 +1704,11 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, goto out; } + if (btr_has_embedded_value(tcx)) { + rc = btr_probe_embedded(tcx, probe_opc, intent, key, hkey); + return rc; + } + nd_off = tcx->tc_tins.ti_root->tr_node; for (start = end = 0, level = 0, next_level = true ;;) { @@ -1499,8 +1760,8 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, * in the right child, otherwise it is in the left child. */ at += !(cmp & BTR_CMP_GT); - btr_trace_set(tcx, level, nd_off, at); - btr_trace_debug(tcx, &tcx->tc_trace[level], "probe child\n"); + btr_trace_set(tcx, level, nd_off, at, BTR_EMBEDDED_NONE); + btr_trace_debug(tcx, &tcx->tc_trace.ti_trace[level], "probe child\n"); /* Search the next level. */ nd_off = btr_node_child_at(tcx, nd_off, at); @@ -1512,7 +1773,7 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, D_ASSERT(level == tcx->tc_depth - 1); D_ASSERT(!UMOFF_IS_NULL(nd_off)); - btr_trace_set(tcx, level, nd_off, at); + btr_trace_set(tcx, level, nd_off, at, BTR_EMBEDDED_NONE); if (cmp == BTR_CMP_EQ && key && btr_has_collision(tcx)) { cmp = btr_cmp(tcx, nd_off, at, NULL, key); @@ -1533,8 +1794,8 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, D_ASSERT(0); case BTR_PROBE_FIRST: do { - alb.nd_off = tcx->tc_trace[level].tr_node; - alb.at = tcx->tc_trace[level].tr_at; + alb.nd_off = tcx->tc_trace.ti_trace[level].tr_node; + alb.at = tcx->tc_trace.ti_trace[level].tr_at; rc = btr_check_availability(tcx, &alb); } while (rc == PROBE_RC_UNAVAILABLE && btr_probe_next(tcx)); @@ -1544,8 +1805,8 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, case BTR_PROBE_LAST: do { - alb.nd_off = tcx->tc_trace[level].tr_node; - alb.at = tcx->tc_trace[level].tr_at; + alb.nd_off = tcx->tc_trace.ti_trace[level].tr_node; + alb.at = tcx->tc_trace.ti_trace[level].tr_at; rc = btr_check_availability(tcx, &alb); } while (rc == PROBE_RC_UNAVAILABLE && btr_probe_prev(tcx)); @@ -1567,8 +1828,8 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, * probed one, this if for the follow-on insert if * applicable. 
*/ - btr_trace_set(tcx, level, nd_off, - at + !(cmp & BTR_CMP_GT)); + btr_trace_set(tcx, level, nd_off, at + !(cmp & BTR_CMP_GT), + BTR_EMBEDDED_NONE); } rc = PROBE_RC_NONE; @@ -1618,7 +1879,7 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, break; } - btr_trace_set(tcx, level, nd_off, saved); + btr_trace_set(tcx, level, nd_off, saved, BTR_EMBEDDED_NONE); rc = PROBE_RC_NONE; goto out; @@ -1662,12 +1923,12 @@ btr_probe(struct btr_context *tcx, dbtree_probe_opc_t probe_opc, D_ASSERT(cmp != BTR_CMP_EQ); /* GT/GE/LT/LE */ rc = PROBE_RC_OK; - out: +out: tcx->tc_probe_rc = rc; if (rc == PROBE_RC_ERR) D_ERROR("Failed to probe: rc = %d\n", tcx->tc_probe_rc); else if (level >= 0) - btr_trace_debug(tcx, &tcx->tc_trace[level], "\n"); + btr_trace_debug(tcx, &tcx->tc_trace.ti_trace[level], "\n"); return rc; } @@ -1692,9 +1953,13 @@ btr_probe_next(struct btr_context *tcx) if (btr_root_empty(tcx)) /* empty tree */ return false; - trace = &tcx->tc_trace[tcx->tc_depth - 1]; + trace = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1]; btr_trace_debug(tcx, trace, "Probe the next\n"); + + if (btr_has_embedded_value(tcx)) /* For embedded value, there is no next entry */ + return false; + while (1) { bool leaf; @@ -1708,7 +1973,7 @@ btr_probe_next(struct btr_context *tcx) */ if (btr_node_is_root(tcx, nd_off) && trace->tr_at >= nd->tn_keyn - leaf) { - D_ASSERT(trace == tcx->tc_trace); + D_ASSERT(trace == tcx->tc_trace.ti_trace); D_DEBUG(DB_TRACE, "End\n"); return false; /* done */ } @@ -1724,7 +1989,7 @@ btr_probe_next(struct btr_context *tcx) break; } - while (trace < &tcx->tc_trace[tcx->tc_depth - 1]) { + while (trace < &tcx->tc_trace.ti_trace[tcx->tc_depth - 1]) { umem_off_t tmp; tmp = btr_node_child_at(tcx, trace->tr_node, trace->tr_at); @@ -1748,16 +2013,20 @@ btr_probe_prev(struct btr_context *tcx) if (btr_root_empty(tcx)) /* empty tree */ return false; - trace = &tcx->tc_trace[tcx->tc_depth - 1]; + trace = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1]; btr_trace_debug(tcx, trace, "Probe the prev\n"); + + if (btr_has_embedded_value(tcx)) /* For embedded value, there is no prev entry */ + return false; + while (1) { nd_off = trace->tr_node; nd = btr_off2ptr(tcx, nd_off); if (btr_node_is_root(tcx, nd_off) && trace->tr_at == 0) { - D_ASSERT(trace == tcx->tc_trace); + D_ASSERT(trace == tcx->tc_trace.ti_trace); D_DEBUG(DB_TRACE, "End\n"); return false; /* done */ } @@ -1777,7 +2046,7 @@ btr_probe_prev(struct btr_context *tcx) break; } - while (trace < &tcx->tc_trace[tcx->tc_depth - 1]) { + while (trace < &tcx->tc_trace.ti_trace[tcx->tc_depth - 1]) { umem_off_t tmp; bool leaf; @@ -1888,7 +2157,7 @@ dbtree_fetch_cur(daos_handle_t toh, d_iov_t *key_out, d_iov_t *val_out) return rc; D_ASSERT(tcx->tc_depth > 0); - trace = &tcx->tc_trace[tcx->tc_depth - 1]; + trace = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1]; nd = btr_off2ptr(tcx, trace->tr_node); D_ASSERT(trace->tr_at <= nd->tn_keyn); @@ -1931,7 +2200,7 @@ fetch_sibling(daos_handle_t toh, d_iov_t *key_out, d_iov_t *val_out, bool next, /* Save original trace */ if (!move) { - orig_trace = tcx->tc_trace; + orig_trace = tcx->tc_trace.ti_trace; memcpy(&orig_traces[0], &tcx->tc_traces[0], sizeof(tcx->tc_traces[0]) * BTR_TRACE_MAX); } @@ -1947,7 +2216,7 @@ fetch_sibling(daos_handle_t toh, d_iov_t *key_out, d_iov_t *val_out, bool next, out: /* Restore original trace */ if (!move) { - tcx->tc_trace = orig_trace; + tcx->tc_trace.ti_trace = orig_trace; memcpy(&tcx->tc_traces[0], &orig_traces[0], sizeof(tcx->tc_traces[0]) * BTR_TRACE_MAX); } @@ -2001,7 +2270,7 @@ 
btr_update(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
 	rc = btr_rec_update(tcx, rec, key, val, val_out);
 	if (rc == -DER_NO_PERM) { /* cannot make inplace change */
-		struct btr_trace *trace = &tcx->tc_trace[tcx->tc_depth - 1];
+		struct btr_trace *trace = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1];

 		if (btr_has_tx(tcx)) {
 			rc = btr_node_tx_add(tcx, trace->tr_node);
@@ -2035,9 +2304,12 @@ btr_insert(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
 	char			  str[BTR_PRINT_BUF];
 	union btr_rec_buf	  rec_buf = {0};
 	int			  rc;
+	bool			  embed = btr_use_embedded_value(tcx);

 	rec = &rec_buf.rb_rec;
-	btr_hkey_gen(tcx, key, &rec->rec_hkey[0]);
+
+	if (!embed)
+		btr_hkey_gen(tcx, key, &rec->rec_hkey[0]);

 	rc = btr_rec_alloc(tcx, key, val, rec, val_out);
 	if (rc != 0) {
@@ -2049,11 +2321,11 @@ btr_insert(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
 	if (D_LOG_ENABLED(DB_TRACE))
 		rec_str = btr_rec_string(tcx, rec, true, str, BTR_PRINT_BUF);

-	if (tcx->tc_depth != 0) {
+	if (tcx->tc_depth != 0 && !btr_has_embedded_value(tcx)) {
 		struct btr_trace *trace;

 		/* trace for the leaf */
-		trace = &tcx->tc_trace[tcx->tc_depth - 1];
+		trace = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1];
 		btr_trace_debug(tcx, trace, "try to insert\n");

 		rc = btr_node_insert_rec(tcx, trace, rec);
@@ -2065,10 +2337,11 @@ btr_insert(struct btr_context *tcx, d_iov_t *key, d_iov_t *val, d_iov_t *val_out
 		}
 	} else {
-		/* empty tree */
-		D_DEBUG(DB_TRACE, "Add record %s to an empty tree\n", rec_str);
+		/* Tree is either empty or only has an embedded value */
+		D_DEBUG(DB_TRACE, "Add record %s to %s\n", rec_str,
+			btr_has_embedded_value(tcx) ? "tree with embedded value" : "empty tree");

-		rc = btr_root_start(tcx, rec);
+		rc = btr_root_start(tcx, rec, key, embed);
 		if (rc != 0) {
 			D_DEBUG(DB_TRACE, "Failed to start the tree: "DF_RC"\n",
 				DP_RC(rc));
@@ -2266,6 +2539,43 @@ dbtree_upsert(daos_handle_t toh, dbtree_probe_opc_t opc, uint32_t intent,
 	return btr_tx_end(tcx, rc);
 }

+/** When paring down from 2 entries in the root to 1, we can remove
+ * the node and restore the embedded entry. This function will modify
+ * the root and set flags accordingly.
+ */
+static int
+btr_node_del_embed(struct btr_context *tcx, struct btr_trace *trace, struct btr_root *root,
+		   void *args)
+{
+	struct btr_record *rec;
+	struct btr_node   *nd;
+	int                rc;
+
+	nd = btr_off2ptr(tcx, trace->tr_node);
+	D_ASSERT(nd->tn_keyn > 0 && nd->tn_keyn > trace->tr_at);
+
+	/** Delete the record */
+	rec = btr_node_rec_at(tcx, trace->tr_node, trace->tr_at);
+	rc  = btr_rec_free(tcx, rec, args);
+	if (rc != 0)
+		return rc;
+
+	/** Now handle the embedding */
+	D_ASSERT(trace->tr_at <= 1);
+	rec = btr_node_rec_at(tcx, trace->tr_node, 1 - trace->tr_at);
+
+	if (btr_has_tx(tcx)) {
+		rc = btr_root_tx_add(tcx);
+		if (rc != 0)
+			return rc;
+	}
+
+	root->tr_node   = rec->rec_off;
+	root->tr_feats |= BTR_FEAT_EMBEDDED;
+	tcx->tc_feats   = root->tr_feats;
+	return btr_node_free(tcx, trace->tr_node);
+}
+
 /**
  * Delete the leaf record pointed by @cur_tr from the current node, then fill
  * the deletion gap by shifting remainded records on the specified direction.
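When a second key arrives while the root is still an embedded record, `btr_root_start()` (earlier in this patch) orders the two records with a single three-way comparison: the lesser key is copied into slot 0 of the new two-entry root leaf and the greater into slot 1. A standalone sketch of that placement rule follows, assuming plain integer keys and hypothetical names rather than the DAOS API:

```c
/* Standalone sketch (not DAOS code) of the two-record bootstrap in
 * btr_root_start(): one comparison decides which record takes slot 0.
 * Types and names are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

enum { CMP_EQ, CMP_LT, CMP_GT }; /* result of existing-vs-incoming compare */

struct rec { uint64_t key; };

/* Returns how 'existing' compares to 'incoming' (like btr_hkey_cmp). */
static int
rec_cmp(const struct rec *existing, const struct rec *incoming)
{
	if (existing->key < incoming->key)
		return CMP_LT;
	return existing->key > incoming->key ? CMP_GT : CMP_EQ;
}

/* Place both records into the freshly allocated two-slot root leaf. */
static void
root_bootstrap(struct rec slots[2], struct rec existing, struct rec incoming)
{
	int cmp = rec_cmp(&existing, &incoming);

	assert(cmp != CMP_EQ);   /* duplicates are handled elsewhere */
	if (cmp == CMP_LT) {     /* existing is lesser: it takes slot 0 */
		slots[0] = existing;
		slots[1] = incoming;
	} else {                 /* incoming is lesser: it takes slot 0 */
		slots[0] = incoming;
		slots[1] = existing;
	}
}

int
main(void)
{
	struct rec slots[2];

	root_bootstrap(slots, (struct rec){42}, (struct rec){7});
	assert(slots[0].key == 7 && slots[1].key == 42);
	return 0;
}
```

The real code additionally regenerates hash keys unless the tree class uses direct keys, and asserts that a hash collision cannot occur on this path.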
@@ -2911,6 +3221,7 @@ btr_root_del_rec(struct btr_context *tcx, struct btr_trace *trace, void *args) struct btr_node *node; struct btr_root *root; int rc = 0; + int threshold = 1; root = tcx->tc_tins.ti_root; node = btr_off2ptr(tcx, trace->tr_node); @@ -2918,12 +3229,21 @@ btr_root_del_rec(struct btr_context *tcx, struct btr_trace *trace, void *args) D_DEBUG(DB_TRACE, "Delete record/child from tree root, depth=%d\n", root->tr_depth); - if (btr_node_is_leaf(tcx, trace->tr_node)) { - D_DEBUG(DB_TRACE, "Delete leaf from the root, key_nr=%d.\n", - node->tn_keyn); + if (btr_has_embedded_value(tcx) || btr_node_is_leaf(tcx, trace->tr_node)) { + if (D_LOG_ENABLED(DB_TRACE)) { + if (btr_has_embedded_value(tcx)) { + D_DEBUG(DB_TRACE, "Delete embedded record from the root\n"); + } else { + D_DEBUG(DB_TRACE, "Delete leaf from the root, key_nr=%d.\n", + node->tn_keyn); + } + } + + if (btr_supports_embedded_value(tcx)) + threshold = 2; /* the root is also a leaf node */ - if (node->tn_keyn > 1) { + if (node->tn_keyn > threshold) { /* have more than one record, simply remove the record * to be deleted. */ @@ -2934,8 +3254,9 @@ btr_root_del_rec(struct btr_context *tcx, struct btr_trace *trace, void *args) } rc = btr_node_del_leaf_only(tcx, trace, true, args); + } else if (node->tn_keyn == 2) { + rc = btr_node_del_embed(tcx, trace, root, args); } else { - rc = btr_node_destroy(tcx, trace->tr_node, args, NULL); if (rc != 0) return rc; @@ -2948,6 +3269,10 @@ btr_root_del_rec(struct btr_context *tcx, struct btr_trace *trace, void *args) root->tr_depth = 0; root->tr_node = BTR_NODE_NULL; + if (btr_has_embedded_value(tcx)) { + root->tr_feats ^= BTR_FEAT_EMBEDDED; + tcx->tc_feats = root->tr_feats; + } btr_context_set_depth(tcx, 0); D_DEBUG(DB_TRACE, "Tree is empty now.\n"); @@ -3002,8 +3327,8 @@ btr_delete(struct btr_context *tcx, void *args) struct btr_trace *cur_tr; int rc = 0; - for (cur_tr = &tcx->tc_trace[tcx->tc_depth - 1];; cur_tr = par_tr) { - if (cur_tr == tcx->tc_trace) { /* root */ + for (cur_tr = &tcx->tc_trace.ti_trace[tcx->tc_depth - 1];; cur_tr = par_tr) { + if (cur_tr == tcx->tc_trace.ti_trace) { /* root */ rc = btr_root_del_rec(tcx, cur_tr, args); break; } @@ -3450,11 +3775,25 @@ btr_node_destroy(struct btr_context *tcx, umem_off_t nd_off, void *args, bool *empty_rc) { struct btr_node *nd = btr_off2ptr(tcx, nd_off); + struct btr_record *rec; bool leaf = btr_node_is_leaf(tcx, nd_off); bool empty = true; int rc; int i; + if (btr_has_embedded_value(tcx)) { + btr_trace_set(tcx, 0, tcx->tc_tins.ti_root->tr_node, 0, BTR_EMBEDDED_SET); + rec = btr_trace2rec(tcx, 0); + rc = btr_rec_free(tcx, rec, args); + if (rc != 0) + return rc; + if (tcx->tc_creds_on) { + D_ASSERT(tcx->tc_creds > 0); + tcx->tc_creds--; + } + goto out; + } + /* NB: don't need to call TX_ADD_RANGE(nd_off, ...) because I never * change it so nothing to undo on transaction failure, I may destroy * it later by calling TX_FREE which is transactional safe. 
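The root-deletion policy in `btr_root_del_rec()` above reduces to a threshold check: with `BTR_FEAT_EMBED_FIRST` enabled the root leaf takes the plain deletion path only while it holds more than two records, exactly two records collapse back to a single embedded entry via `btr_node_del_embed()`, and removing the last record empties the tree. A small sketch of that decision, with hypothetical names; thresholds follow the hunk above:

```c
/* Sketch (not DAOS code) of the root-deletion decision in
 * btr_root_del_rec(). Enum and function names are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

enum root_action {
	ROOT_DELETE_IN_PLACE,   /* enough records: plain leaf deletion        */
	ROOT_COLLAPSE_TO_EMBED, /* two records: free node, re-embed survivor  */
	ROOT_BECOME_EMPTY,      /* last record gone: tree becomes empty       */
};

static enum root_action
root_del_action(int key_count, bool embed_supported)
{
	/* Without embed support a single record still needs a real node. */
	int threshold = embed_supported ? 2 : 1;

	if (key_count > threshold)
		return ROOT_DELETE_IN_PLACE;
	if (embed_supported && key_count == 2)
		return ROOT_COLLAPSE_TO_EMBED;
	return ROOT_BECOME_EMPTY;
}

int
main(void)
{
	printf("%d\n", root_del_action(3, true));  /* delete in place          */
	printf("%d\n", root_del_action(2, true));  /* collapse to embedded     */
	printf("%d\n", root_del_action(1, true));  /* tree becomes empty       */
	printf("%d\n", root_del_action(2, false)); /* in place, embed disabled */
	return 0;
}
```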
@@ -3464,8 +3803,6 @@ btr_node_destroy(struct btr_context *tcx, umem_off_t nd_off, if (leaf) { for (i = nd->tn_keyn - 1; i >= 0; i--) { - struct btr_record *rec; - rec = btr_node_rec_at(tcx, nd_off, i); rc = btr_rec_free(tcx, rec, args); if (rc != 0) @@ -3522,6 +3859,7 @@ btr_node_destroy(struct btr_context *tcx, umem_off_t nd_off, nd->tn_keyn = i; } +out: if (empty_rc) *empty_rc = empty; @@ -3896,6 +4234,9 @@ dbtree_iter_fetch(daos_handle_t ih, d_iov_t *key, anchor->da_type = DAOS_ANCHOR_TYPE_KEY; } else { + rc = btr_embedded_create_hash(tcx, false); + if (rc != 0) + return rc; btr_hkey_copy(tcx, (char *)&anchor->da_buf[0], &rec->rec_hkey[0]); anchor->da_type = DAOS_ANCHOR_TYPE_HKEY; @@ -4150,8 +4491,21 @@ btr_class_init(umem_off_t root_off, struct btr_root *root, if (tc->tc_feats & BTR_FEAT_SKIP_LEAF_REBAL) *tree_feats |= BTR_FEAT_SKIP_LEAF_REBAL; - /** Only check btree managed bits */ - if ((*tree_feats & tc->tc_feats) != (*tree_feats & BTR_FEAT_MASK)) { + if ((*tree_feats & (BTR_FEAT_UINT_KEY | BTR_FEAT_EMBED_FIRST)) == + (BTR_FEAT_UINT_KEY | BTR_FEAT_EMBED_FIRST)) { + /** The key is normally stored in value but with integer + * keys, it's stored in the btr_record. While we would + * save an indirection if we added 8 bytes to the value + * allocation, we would have 8 unrecoverable bytes stored + * with that value. It would also add some complication + * to the key retrieval logic. For now, integer keys are + * not supported for this optimization. + */ + *tree_feats ^= BTR_FEAT_EMBED_FIRST; + } + + /** Only check btree managed bits that can be set in tr_class */ + if ((*tree_feats & tc->tc_feats) != (*tree_feats & BTR_EXT_FEAT_MASK)) { D_ERROR("Unsupported features "DF_X64"/"DF_X64"\n", *tree_feats, tc->tc_feats); return -DER_PROTO; @@ -4198,6 +4552,11 @@ dbtree_class_register(unsigned int tree_class, uint64_t tree_feats, D_ASSERT(ops->to_rec_alloc != NULL); D_ASSERT(ops->to_rec_free != NULL); + if (tree_feats & BTR_FEAT_EMBED_FIRST) { + D_ASSERT(ops->to_check_availability == NULL); + D_ASSERT(ops->to_key_cmp != NULL); + } + btr_class_registered[tree_class].tc_ops = ops; btr_class_registered[tree_class].tc_feats = tree_feats; diff --git a/src/common/debug.c b/src/common/debug.c index 5096e3ec92c7..b586f0e50d3e 100644 --- a/src/common/debug.c +++ b/src/common/debug.c @@ -104,14 +104,16 @@ unsigned int daos_io_bypass; static void io_bypass_init(void) { - char *str = getenv(DENV_IO_BYPASS); - char *tok; - char *saved_ptr; + char *str; + char *tok; + char *saved_ptr; + char *env; - if (!str) + d_agetenv_str(&env, DENV_IO_BYPASS); + if (env == NULL) return; - tok = strtok_r(str, ",", &saved_ptr); + tok = strtok_r(env, ",", &saved_ptr); while (tok) { struct io_bypass *iob; @@ -129,6 +131,7 @@ io_bypass_init(void) } tok = str; }; + d_freeenv_str(&env); } void @@ -162,17 +165,18 @@ daos_debug_init_ex(char *logfile, d_dbug_t logmask) } /* honor the env variable first */ - logfile = getenv(D_LOG_FILE_ENV); + rc = d_agetenv_str(&logfile, D_LOG_FILE_ENV); if (logfile == NULL || strlen(logfile) == 0) { flags |= DLOG_FLV_STDOUT; - logfile = NULL; + d_freeenv_str(&logfile); } else if (!strncmp(logfile, "/dev/null", 9)) { /* Don't set up logging or log to stdout if the log file is /dev/null */ - logfile = NULL; + d_freeenv_str(&logfile); } rc = d_log_init_adv("DAOS", logfile, flags, logmask, DLOG_CRIT, log_id_cb); + d_freeenv_str(&logfile); if (rc != 0) { D_PRINT_ERR("Failed to init DAOS debug log: "DF_RC"\n", DP_RC(rc)); diff --git a/src/common/misc.c b/src/common/misc.c index 
f7d6b1ddad0b..afd8ed8fa38e 100644
--- a/src/common/misc.c
+++ b/src/common/misc.c
@@ -684,13 +684,15 @@ daos_crt_init_opt_get(bool server, int ctx_nr)
 	 * 1) now sockets provider cannot create more than 16 contexts for SEP
 	 * 2) some problems if SEP communicates with regular EP.
 	 */
-	addr_env = (crt_phy_addr_t)getenv(CRT_PHY_ADDR_ENV);
+	d_agetenv_str(&addr_env, CRT_PHY_ADDR_ENV);
 	if (addr_env != NULL &&
 	    strncmp(addr_env, CRT_SOCKET_PROV, strlen(CRT_SOCKET_PROV)) == 0) {
 		D_INFO("for sockets provider force it to use regular EP.\n");
 		daos_crt_init_opt.cio_use_sep = 0;
+		d_freeenv_str(&addr_env);
 		goto out;
 	}
+	d_freeenv_str(&addr_env);

 	daos_crt_init_opt.cio_use_sep = 1;
diff --git a/src/common/prop.c b/src/common/prop.c
index 3c48bf15f598..b03751bc8626 100644
--- a/src/common/prop.c
+++ b/src/common/prop.c
@@ -415,6 +415,21 @@ daos_prop_valid(daos_prop_t *prop, bool pool, bool input)
 				return false;
 			}
 			break;
+		case DAOS_PROP_PO_SVC_OPS_ENABLED:
+			val = prop->dpp_entries[i].dpe_val;
+			if (val > 1) {
+				D_ERROR("invalid svc_ops_enabled " DF_U64 ".\n", val);
+				return false;
+			}
+			break;
+		case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE:
+			val = prop->dpp_entries[i].dpe_val;
+			if ((val < DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MIN) ||
+			    (val > DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MAX)) {
+				D_ERROR("invalid svc_ops_entry_age " DF_U64 ".\n", val);
+				return false;
+			}
+			break;
 		/* container-only properties */
 		case DAOS_PROP_CO_LAYOUT_TYPE:
 			val = prop->dpp_entries[i].dpe_val;
diff --git a/src/common/tests/btree.c b/src/common/tests/btree.c
index 5e5c913f6738..69a595d61097 100644
--- a/src/common/tests/btree.c
+++ b/src/common/tests/btree.c
@@ -88,6 +88,17 @@ ik_hkey_gen(struct btr_instance *tins, d_iov_t *key_iov, void *hkey)
 	memcpy(hkey, ikey, sizeof(*ikey));
 }

+static int
+ik_key_cmp(struct btr_instance *tins, struct btr_record *rec, d_iov_t *key_iov)
+{
+	uint64_t       key1 = *(uint64_t *)key_iov->iov_buf;
+	struct ik_rec *irec = umem_off2ptr(&tins->ti_umm, rec->rec_off);
+
+	if (irec->ir_key < key1)
+		return BTR_CMP_LT;
+	return irec->ir_key > key1 ?
BTR_CMP_GT : BTR_CMP_EQ; +} + static int ik_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, struct btr_record *rec, d_iov_t *val_out) @@ -101,7 +112,7 @@ ik_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, irec = umem_off2ptr(&tins->ti_umm, irec_off); - irec->ir_key = *(int *)key_iov->iov_buf; + irec->ir_key = *(uint64_t *)key_iov->iov_buf; irec->ir_val_size = irec->ir_val_msize = val_iov->iov_len; irec->ir_val_off = umem_alloc(&tins->ti_umm, val_iov->iov_len); @@ -242,14 +253,15 @@ ik_rec_stat(struct btr_instance *tins, struct btr_record *rec, } static btr_ops_t ik_ops = { - .to_hkey_size = ik_hkey_size, - .to_hkey_gen = ik_hkey_gen, - .to_rec_alloc = ik_rec_alloc, - .to_rec_free = ik_rec_free, - .to_rec_fetch = ik_rec_fetch, - .to_rec_update = ik_rec_update, - .to_rec_string = ik_rec_string, - .to_rec_stat = ik_rec_stat, + .to_hkey_size = ik_hkey_size, + .to_hkey_gen = ik_hkey_gen, + .to_key_cmp = ik_key_cmp, + .to_rec_alloc = ik_rec_alloc, + .to_rec_free = ik_rec_free, + .to_rec_fetch = ik_rec_fetch, + .to_rec_update = ik_rec_update, + .to_rec_string = ik_rec_string, + .to_rec_stat = ik_rec_stat, }; #define IK_SEP ',' @@ -276,6 +288,9 @@ ik_btr_open_create(void **state) if (arg[0] == '+') { feats = BTR_FEAT_UINT_KEY; arg += 1; + } else if (arg[0] == '%') { + feats = BTR_FEAT_EMBED_FIRST; + arg += 1; } if (arg[0] == 'i') { /* inplace create/open */ inplace = true; @@ -1052,8 +1067,8 @@ main(int argc, char **argv) } } - rc = dbtree_class_register(IK_TREE_CLASS, - dynamic_flag | BTR_FEAT_UINT_KEY, &ik_ops); + rc = dbtree_class_register( + IK_TREE_CLASS, dynamic_flag | BTR_FEAT_EMBED_FIRST | BTR_FEAT_UINT_KEY, &ik_ops); D_ASSERT(rc == 0); if (ik_utx == NULL) { diff --git a/src/common/tests/btree.sh b/src/common/tests/btree.sh index b5aa37164bc8..03a35d47f9ee 100755 --- a/src/common/tests/btree.sh +++ b/src/common/tests/btree.sh @@ -16,7 +16,7 @@ elif [ "$USE_VALGRIND" = "pmemcheck" ]; then VCMD="valgrind --tool=pmemcheck" fi -ORDER=${ORDER:-3} +ORDER=${ORDER:-11} DDEBUG=${DDEBUG:-0} @@ -67,6 +67,12 @@ while [ $# -gt 0 ]; do UINT="+" test_conf_pre="${test_conf_pre} ukey" ;; + emb) + shift + # reuse this flag since they are mutually exclusive + UINT="%" + test_conf_pre="${test_conf_pre} ukey" + ;; direct) BTR=${SL_BUILD_DIR}/src/common/tests/btree_direct KEYS=${KEYS:-"delta,lambda,kappa,omega,beta,alpha,epsilon"} diff --git a/src/common/tests/btree_direct.c b/src/common/tests/btree_direct.c index 8a23461f4205..8fb3fa8b6fdd 100644 --- a/src/common/tests/btree_direct.c +++ b/src/common/tests/btree_direct.c @@ -89,30 +89,48 @@ static void sk_key_decode(struct btr_instance *tins, } static int -sk_key_cmp(struct btr_instance *tins, struct btr_record *rec, - d_iov_t *key_iov) +key_cmp(const void *k1, const void *k2) { - struct sk_rec *srec; - char *s1; - char *s2; - uint64_t len; - int rc; - - srec = (struct sk_rec *)umem_off2ptr(&tins->ti_umm, rec->rec_off); + const d_iov_t *key1 = k1; + const d_iov_t *key2 = k2; + const char *s1 = key1->iov_buf; + const char *s2 = key2->iov_buf; + uint64_t len; + int rc; - /* NB: Since strings are null terminated, this should suffice to - * make shorter string less than larger one - */ - len = min(srec->sr_key_len, key_iov->iov_len); + len = min(key1->iov_len, key2->iov_len); - s1 = &srec->sr_key[0]; - s2 = key_iov->iov_buf; rc = strncasecmp(s1, s2, len); if (rc != 0) - return dbtree_key_cmp_rc(rc); + return rc; + + return strncmp(s1, s2, len); +} + +struct kv_node { + d_iov_t key; + d_iov_t val; +}; + +/* Sort the keys (for sanity 
check) */ +static void +sk_btr_sort_keys(struct kv_node *kv, unsigned int key_nr) +{ + qsort(kv, key_nr, sizeof(*kv), key_cmp); +} + +static int +sk_key_cmp(struct btr_instance *tins, struct btr_record *rec, d_iov_t *key_iov2) +{ + struct sk_rec *srec; + d_iov_t key_iov1; + + srec = (struct sk_rec *)umem_off2ptr(&tins->ti_umm, rec->rec_off); + + d_iov_set(&key_iov1, &srec->sr_key[0], srec->sr_key_len); - return dbtree_key_cmp_rc(strncmp(s1, s2, len)); + return dbtree_key_cmp_rc(key_cmp(&key_iov1, key_iov2)); } static int @@ -299,6 +317,10 @@ sk_btr_open_create(void **state) } if (create && arg != NULL) { + if (arg[0] == '%') { + feats = BTR_FEAT_EMBED_FIRST; + arg += 1; + } if (arg[0] == 'i') { /* inplace create/open */ inplace = true; if (arg[1] != SK_SEP) { @@ -681,12 +703,6 @@ sk_btr_iterate(void **state) D_PRINT("Test Passed\n"); } -struct kv_node { - d_iov_t key; - d_iov_t val; -}; - - /* Mix up the keys */ static void sk_btr_mix_keys(struct kv_node *kv, unsigned int key_nr) @@ -706,33 +722,6 @@ sk_btr_mix_keys(struct kv_node *kv, unsigned int key_nr) } } -static int -key_cmp(const void *k1, const void *k2) -{ - const d_iov_t *key1 = k1; - const d_iov_t *key2 = k2; - const char *s1 = key1->iov_buf; - const char *s2 = key2->iov_buf; - uint64_t len; - int rc; - - len = min(key1->iov_len, key2->iov_len); - - rc = strncasecmp(s1, s2, len); - - if (rc != 0) - return rc; - - return strncmp(s1, s2, len); -} - -/* Sort the keys (for sanity check) */ -static void -sk_btr_sort_keys(struct kv_node *kv, unsigned int key_nr) -{ - qsort(kv, key_nr, sizeof(*kv), key_cmp); -} - const char valid[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; #define INT_LEN 32 @@ -1161,7 +1150,8 @@ main(int argc, char **argv) if (rc != 0) return rc; - rc = dbtree_class_register(SK_TREE_CLASS, BTR_FEAT_DIRECT_KEY, &sk_ops); + rc = dbtree_class_register(SK_TREE_CLASS, BTR_FEAT_EMBED_FIRST | BTR_FEAT_DIRECT_KEY, + &sk_ops); D_ASSERT(rc == 0); stop_idx = argc-1; diff --git a/src/common/tests_dmg_helpers.c b/src/common/tests_dmg_helpers.c index fccf5a41e57d..aa5cfa29b14a 100644 --- a/src/common/tests_dmg_helpers.c +++ b/src/common/tests_dmg_helpers.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -646,6 +646,56 @@ dmg_pool_create(const char *dmg_config_file, D_GOTO(out, rc = -DER_NOMEM); has_label = true; } + + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SCRUB_MODE); + if (entry != NULL) { + const char *scrub_str = NULL; + + switch (entry->dpe_val) { + case DAOS_SCRUB_MODE_OFF: + scrub_str = "off"; + break; + case DAOS_SCRUB_MODE_LAZY: + scrub_str = "lazy"; + break; + case DAOS_SCRUB_MODE_TIMED: + scrub_str = "timed"; + break; + default: + break; + } + + if (scrub_str) { + args = cmd_push_arg(args, &argcount, "--properties=scrub:%s ", + scrub_str); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + } + + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENABLED); + if (entry != NULL) { + args = cmd_push_arg(args, &argcount, "--properties=svc_ops_enabled:%zu ", + entry->dpe_val); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENTRY_AGE); + if (entry != NULL) { + args = cmd_push_arg(args, &argcount, "--properties=svc_ops_entry_age:%zu ", + entry->dpe_val); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SPACE_RB); + if (entry != NULL) { + args = cmd_push_arg(args, &argcount, "--properties=space_rb:%zu ", + entry->dpe_val); + if (args == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } } if (!has_label) { @@ -1032,6 +1082,7 @@ parse_device_info(struct json_object *smd_dev, device_list *devices, { struct json_object *tmp; struct json_object *dev = NULL; + struct json_object *ctrlr = NULL; struct json_object *target = NULL; struct json_object *targets; int tgts_len; @@ -1081,19 +1132,24 @@ parse_device_info(struct json_object *smd_dev, device_list *devices, } devices[*disks].n_tgtidx = tgts_len; - if (!json_object_object_get_ex(dev, "dev_state", &tmp)) { - D_ERROR("unable to extract state from JSON\n"); + if (!json_object_object_get_ex(dev, "rank", &tmp)) { + D_ERROR("unable to extract rank from JSON\n"); return -DER_INVAL; } + devices[*disks].rank = atoi(json_object_to_json_string(tmp)); - snprintf(devices[*disks].state, sizeof(devices[*disks].state), - "%s", json_object_to_json_string(tmp)); + if (!json_object_object_get_ex(dev, "ctrlr", &ctrlr)) { + D_ERROR("unable to extract ctrlr obj from JSON\n"); + return -DER_INVAL; + } - if (!json_object_object_get_ex(dev, "rank", &tmp)) { - D_ERROR("unable to extract rank from JSON\n"); + if (!json_object_object_get_ex(ctrlr, "dev_state", &tmp)) { + D_ERROR("unable to extract state from JSON\n"); return -DER_INVAL; } - devices[*disks].rank = atoi(json_object_to_json_string(tmp)); + + snprintf(devices[*disks].state, sizeof(devices[*disks].state), "%s", + json_object_to_json_string(tmp)); *disks = *disks + 1; } @@ -1235,9 +1291,10 @@ dmg_storage_query_device_health(const char *dmg_config_file, char *host, struct json_object *storage_map = NULL; struct json_object *smd_info = NULL; struct json_object *storage_info = NULL; - struct json_object *health_info = NULL; + struct json_object *health_stats = NULL; struct json_object *devices = NULL; struct json_object *dev_info = NULL; + struct json_object *ctrlr_info = NULL; struct json_object *tmp = NULL; char uuid_str[DAOS_UUID_STR_SIZE]; int argcount = 0; @@ -1284,10 +1341,13 @@ dmg_storage_query_device_health(const char *dmg_config_file, char *host, } dev_info = json_object_array_get_idx(devices, 0); - json_object_object_get_ex(dev_info, "health", &health_info); - if (health_info != NULL) { - 
json_object_object_get_ex(health_info, stats, - &tmp); + if (!json_object_object_get_ex(dev_info, "ctrlr", &ctrlr_info)) { + D_ERROR("unable to extract ctrlr details from JSON\n"); + D_GOTO(out_json, rc = -DER_INVAL); + } + json_object_object_get_ex(ctrlr_info, "health_stats", &health_stats); + if (health_stats != NULL) { + json_object_object_get_ex(health_stats, stats, &tmp); strcpy(stats, json_object_to_json_string(tmp)); } } diff --git a/src/container/oid_iv.c b/src/container/oid_iv.c index d1041184006f..f10f5d34f7e2 100644 --- a/src/container/oid_iv.c +++ b/src/container/oid_iv.c @@ -31,6 +31,7 @@ struct oid_iv_entry { struct oid_iv_range rg; /** protect the entry */ ABT_mutex lock; + void *current_req; }; /** Priv data in the iv layer */ @@ -130,7 +131,14 @@ oid_iv_ent_update(struct ds_iv_entry *ns_entry, struct ds_iv_key *iv_key, D_ASSERT(priv != NULL); entry = ns_entry->iv_value.sg_iovs[0].iov_buf; - ABT_mutex_lock(entry->lock); + rc = ABT_mutex_trylock(entry->lock); + /* For retry requests, from _iv_op(), the lock may not be released + * in some cases. + */ + if (rc == ABT_ERR_MUTEX_LOCKED && entry->current_req != src) + return -DER_BUSY; + + entry->current_req = src; avail = &entry->rg; oids = src->sg_iovs[0].iov_buf; diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 938970f62328..ed1137f1b4be 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -31,6 +31,7 @@ #define DAOS_POOL_GLOBAL_VERSION_WITH_CONT_MDTIMES 2 #define DAOS_POOL_GLOBAL_VERSION_WITH_CONT_NHANDLES 2 #define DAOS_POOL_GLOBAL_VERSION_WITH_CONT_EX_EVICT 2 +#define DAOS_POOL_GLOBAL_VERSION_WITH_OIT_OID_KVS 2 static int cont_prop_read(struct rdb_tx *tx, struct cont *cont, uint64_t bits, @@ -131,18 +132,8 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id, if (rc != 0) goto err_hdls; - /* cs_ops */ - rc = rdb_path_clone(&svc->cs_root, &svc->cs_ops); - if (rc != 0) - goto err_hdls; - rc = rdb_path_push(&svc->cs_ops, &ds_cont_prop_svc_ops); - if (rc != 0) - goto err_svcops; - return 0; -err_svcops: - rdb_path_fini(&svc->cs_ops); err_hdls: rdb_path_fini(&svc->cs_hdls); err_conts: @@ -160,7 +151,6 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id, static void cont_svc_fini(struct cont_svc *svc) { - rdb_path_fini(&svc->cs_ops); rdb_path_fini(&svc->cs_hdls); rdb_path_fini(&svc->cs_conts); rdb_path_fini(&svc->cs_uuids); @@ -1241,15 +1231,17 @@ cont_create(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont_svc *sv } /* Create the oit oids index KVS. 
 */
-	attr.dsa_class = RDB_KVS_GENERIC;
-	attr.dsa_order = 16;
-	rc = rdb_tx_create_kvs(tx, &kvs, &ds_cont_prop_oit_oids, &attr);
-	if (rc != 0) {
-		D_ERROR(DF_CONT" failed to create container oit oids KVS: "
-			""DF_RC"\n",
-			DP_CONT(pool_hdl->sph_pool->sp_uuid,
-				in->cci_op.ci_uuid), DP_RC(rc));
-		D_GOTO(out_kvs, rc);
+	if (pool_hdl->sph_global_ver >= DAOS_POOL_GLOBAL_VERSION_WITH_OIT_OID_KVS) {
+		attr.dsa_class = RDB_KVS_GENERIC;
+		attr.dsa_order = 16;
+		rc = rdb_tx_create_kvs(tx, &kvs, &ds_cont_prop_oit_oids, &attr);
+		if (rc != 0) {
+			D_ERROR(DF_CONT" failed to create container oit oids KVS: "
+				""DF_RC"\n",
+				DP_CONT(pool_hdl->sph_pool->sp_uuid,
+					in->cci_op.ci_uuid), DP_RC(rc));
+			D_GOTO(out_kvs, rc);
+		}
 	}

 out_kvs:
@@ -1540,6 +1532,7 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont,
 	struct d_ownership	owner;
 	uint32_t		force;
 	struct daos_acl	       *acl;
+	bool			need_destroy_oid_oit_kvs = false;

 	cont_destroy_in_get_data(rpc, opc_get(rpc->cr_opc), cont_proto_ver, &force, NULL);
 	D_DEBUG(DB_MD, DF_CONT ": processing rpc: %p force=%u\n",
@@ -1580,10 +1573,29 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont,

 	cont_ec_agg_delete(cont->c_svc, cont->c_uuid);

+	if (pool_hdl->sph_global_ver >= DAOS_POOL_GLOBAL_VERSION_WITH_OIT_OID_KVS) {
+		need_destroy_oid_oit_kvs = true;
+	} else {
+		d_iov_t value;
+
+		d_iov_set(&value, NULL, 0);
+		rc = rdb_tx_lookup(tx, &cont->c_prop, &ds_cont_prop_oit_oids, &value);
+		if (rc && rc != -DER_NONEXIST) {
+			DL_ERROR(rc, "failed to lookup oit oid kvs pool/cont: " DF_CONTF,
+				 DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid));
+			goto out_prop;
+		}
+		/* There was a bug where the oit oids KVS might already have been created; see DAOS-14799 */
+		if (rc == 0)
+			need_destroy_oid_oit_kvs = true;
+	}
+
 	/* Destroy oit oids index KVS. */
-	rc = rdb_tx_destroy_kvs(tx, &cont->c_prop, &ds_cont_prop_oit_oids);
-	if (rc != 0)
-		goto out_prop;
+	if (need_destroy_oid_oit_kvs) {
+		rc = rdb_tx_destroy_kvs(tx, &cont->c_prop, &ds_cont_prop_oit_oids);
+		if (rc != 0)
+			goto out_prop;
+	}

 	/* Destroy the handle index KVS. */
 	rc = rdb_tx_destroy_kvs(tx, &cont->c_prop, &ds_cont_prop_handles);
@@ -4628,19 +4640,30 @@ upgrade_cont_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *varg)
 		goto out;
 	}

-	if (from_global_ver < 2) {
+	if (from_global_ver < DAOS_POOL_GLOBAL_VERSION_WITH_OIT_OID_KVS) {
 		struct rdb_kvs_attr	attr;

-		/* Create the oit oids index KVS. */
-		attr.dsa_class = RDB_KVS_GENERIC;
-		attr.dsa_order = 16;
-		rc = rdb_tx_create_kvs(ap->tx, &cont->c_prop, &ds_cont_prop_oit_oids, &attr);
-		if (rc != 0) {
-			D_ERROR(DF_CONT" failed to create container oit oids KVS: "
-				""DF_RC"\n",
-				DP_CONT(ap->pool_uuid, cont_uuid), DP_RC(rc));
+		d_iov_set(&value, NULL, 0);
+		rc = rdb_tx_lookup(ap->tx, &cont->c_prop, &ds_cont_prop_oit_oids, &value);
+		/* There was a bug where the oit oids KVS might already have been created; see DAOS-14799 */
+		if (rc && rc != -DER_NONEXIST) {
+			DL_ERROR(rc, "failed to lookup oit oid kvs pool/cont: " DF_CONTF,
+				 DP_CONT(ap->pool_uuid, cont_uuid));
 			goto out;
 		}
+
+		/* Create the oit oids index KVS.
*/ + if (rc == -DER_NONEXIST) { + attr.dsa_class = RDB_KVS_GENERIC; + attr.dsa_order = 16; + rc = rdb_tx_create_kvs(ap->tx, &cont->c_prop, &ds_cont_prop_oit_oids, &attr); + if (rc != 0) { + D_ERROR(DF_CONT" failed to create container oit oids KVS: " + ""DF_RC"\n", + DP_CONT(ap->pool_uuid, cont_uuid), DP_RC(rc)); + goto out; + } + } } entry = daos_prop_entry_get(prop, DAOS_PROP_CO_OBJ_VERSION); @@ -5237,142 +5260,47 @@ static int cont_op_lookup(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, crt_rpc_t *rpc, int cont_proto_ver, bool *is_dup, struct ds_pool_svc_op_val *valp) { - struct cont_op_v8_in *in8 = crt_req_get(rpc); - struct ds_pool_svc_op_key op_key; - struct ds_pool_svc_op_val op_val; - d_iov_t key; - d_iov_t val; - uint32_t svc_ops_enabled; - bool proto_enabled; - bool dup = false; - crt_opcode_t opc = opc_get(rpc->cr_opc); - int rc = 0; + struct cont_op_v8_in *in8 = crt_req_get(rpc); + crt_opcode_t opc = opc_get(rpc->cr_opc); + int rc = 0; /* If client didn't provide a key (old protocol), skip */ - proto_enabled = (cont_proto_ver >= CONT_PROTO_VER_WITH_SVC_OP_KEY); - if (!proto_enabled) + if (cont_proto_ver < CONT_PROTO_VER_WITH_SVC_OP_KEY) goto out; /* If the operation is not a write, skip (read-only ops not tracked for duplicates) */ if (!cont_op_is_write(opc)) goto out; - /* If enabled, lookup client-provided op key, assign dup_op accordingly. */ - /* TODO: lookup from a cached value in struct pool_svc rather than rdb */ - d_iov_set(&val, &svc_ops_enabled, sizeof(svc_ops_enabled)); - rc = rdb_tx_lookup(tx, &svc->cs_root, &ds_cont_prop_svc_ops_enabled, &val); - if (rc == -DER_NONEXIST) { - rc = 0; - goto out; - } else if (rc != 0) { - DL_ERROR(rc, DF_CONT ": failed to lookup svc_ops_enabled", - DP_CONT(pool_hdl->sph_pool->sp_uuid, in8->ci_uuid)); - goto out; - } - if (!svc_ops_enabled) - goto out; - - uuid_copy(op_key.ok_client_id, in8->ci_cli_id); - op_key.ok_client_time = in8->ci_time; - d_iov_set(&key, &op_key, sizeof(op_key)); - d_iov_set(&val, &op_val, sizeof(op_val)); - - rc = rdb_tx_lookup(tx, &svc->cs_ops, &key, &val); - if (rc == 0) { - /* found - this is a retry/duplicate RPC being handled */ - D_DEBUG(DB_MD, - DF_CONT ": retry RPC detected client=" DF_UUID " time=" DF_X64 " rc=%d\n", - DP_CONT(pool_hdl->sph_pool->sp_uuid, in8->ci_uuid), DP_UUID(in8->ci_cli_id), - in8->ci_time, op_val.ov_rc); - dup = true; - } else if (rc == -DER_NONEXIST) { - /* not found - new, unique RPC being handled */ - rc = 0; - } else { - DL_ERROR(rc, DF_CONT ": failed to lookup RPC client=" DF_UUID " time=" DF_X64, - DP_CONT(pool_hdl->sph_pool->sp_uuid, in8->ci_uuid), - DP_UUID(in8->ci_cli_id), in8->ci_time); - goto out; - } + rc = ds_pool_svc_ops_lookup(tx, NULL /* pool_svc */, pool_hdl->sph_pool->sp_uuid, + &in8->ci_cli_id, in8->ci_time, is_dup, valp); out: - if (rc == 0) { - *is_dup = dup; - if (dup) - *valp = op_val; - } return rc; } /* Save results of the operation in svc_ops KVS, in the existing rdb_tx context. 
*/ static int cont_op_save(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, crt_rpc_t *rpc, - int cont_proto_ver, int rc_in, struct ds_pool_svc_op_val *op_valp) + bool dup_op, int cont_proto_ver, int rc_in, struct ds_pool_svc_op_val *op_valp) { struct cont_op_v8_in *in8 = crt_req_get(rpc); - d_iov_t key; - d_iov_t val; - struct ds_pool_svc_op_key op_key; - uint32_t svc_ops_enabled; - bool proto_enabled; crt_opcode_t opc = opc_get(rpc->cr_opc); int rc = 0; - op_valp->ov_rc = rc_in; + if (!dup_op) + op_valp->ov_rc = rc_in; /* If client didn't provide a key (old protocol), skip */ - proto_enabled = (cont_proto_ver >= CONT_PROTO_VER_WITH_SVC_OP_KEY); - if (!proto_enabled) + if (cont_proto_ver < CONT_PROTO_VER_WITH_SVC_OP_KEY) goto out; /* If the operation is not a write, skip (read-only ops not tracked for duplicates) */ if (!cont_op_is_write(opc)) goto out; - /* If enabled, save client-provided op key and result of the operation. */ - d_iov_set(&val, &svc_ops_enabled, sizeof(svc_ops_enabled)); - rc = rdb_tx_lookup(tx, &svc->cs_root, &ds_cont_prop_svc_ops_enabled, &val); - if (rc == -DER_NONEXIST) { - rc = 0; - goto out; - } else if (rc != 0) { - DL_ERROR(rc, DF_CONT ": failed to lookup svc_ops_enabled", - DP_CONT(pool_hdl->sph_pool->sp_uuid, in8->ci_uuid)); - goto out; - } - if (!svc_ops_enabled) - goto out; - - /* TODO: implement mechanism to constrain rdb space usage by this KVS. */ - goto out; - - /* Save result in cs_ops KVS, only if the return code is "definitive" (not retryable). */ - if (!daos_rpc_retryable_rc(op_valp->ov_rc)) { - /* If the write operation failed, discard its (unwanted) updates first. */ - if (op_valp->ov_rc != 0) - rdb_tx_discard(tx); - - uuid_copy(op_key.ok_client_id, in8->ci_cli_id); - op_key.ok_client_time = in8->ci_time; - d_iov_set(&key, &op_key, sizeof(op_key)); - d_iov_set(&val, op_valp, sizeof(*op_valp)); - - rc = rdb_tx_lookup(tx, &svc->cs_ops, &key, &val); - if (rc != -DER_NONEXIST) { - D_ASSERT(rc != 0); - goto out; - } - - rc = rdb_tx_update(tx, &svc->cs_ops, &key, &val); - if (rc != 0) { - DL_ERROR(rc, - DF_CONT ": failed to update svc_ops client=" DF_UUID - " time=" DF_X64, - DP_CONT(pool_hdl->sph_pool->sp_uuid, in8->ci_uuid), - DP_UUID(in8->ci_cli_id), in8->ci_time); - goto out; - } - } + rc = ds_pool_svc_ops_save(tx, NULL /* pool_svc */, svc->cs_pool_uuid, &in8->ci_cli_id, + in8->ci_time, dup_op, rc_in, op_valp); out: return rc; @@ -5399,8 +5327,13 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, struct ds_pool_svc_op_val op_val; bool fi_pass_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_PASS_NOREPLY); bool fi_fail_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_FAIL_NOREPLY); + bool fi_pass_nl_noreply; + bool fi_fail_nl_noreply; int rc; + fi_pass_nl_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_PASS_NOREPLY_NEWLDR); + fi_fail_nl_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_FAIL_NOREPLY_NEWLDR); + rc = rdb_tx_begin(svc->cs_rsvc->s_db, svc->cs_rsvc->s_term, &tx); if (rc != 0) goto out; @@ -5414,7 +5347,7 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, rc = cont_op_lookup(&tx, pool_hdl, svc, rpc, cont_proto_ver, &dup_op, &op_val); if (rc != 0) goto out_lock; - else if (fi_fail_noreply) + else if (fi_fail_noreply || fi_fail_nl_noreply) goto out_commit; switch (opc) { @@ -5439,6 +5372,8 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, uuid_copy(olbl_out->colo_uuid, cont->c_uuid); break; case CONT_DESTROY_BYLABEL: + if (dup_op) + goto out_commit; cont_op_in_get_label(rpc, opc, cont_proto_ver, &clbl); rc 
= cont_lookup_bylabel(&tx, svc, clbl, &cont); if (rc != 0) @@ -5447,6 +5382,8 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, dup_op, &op_val); break; default: + if ((opc == CONT_DESTROY) && dup_op) + goto out_commit; rc = cont_lookup(&tx, svc, in->ci_uuid, &cont); if (rc != 0) goto out_commit; @@ -5465,10 +5402,9 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, goto out_commit; out_commit: - if ((rc == 0) && !dup_op && fi_fail_noreply) + if ((rc == 0) && !dup_op && (fi_fail_noreply || fi_fail_nl_noreply)) rc = -DER_MISC; - if (!dup_op) - rc = cont_op_save(&tx, pool_hdl, svc, rpc, cont_proto_ver, rc, &op_val); + rc = cont_op_save(&tx, pool_hdl, svc, rpc, dup_op, cont_proto_ver, rc, &op_val); if (rc != 0) goto out_contref; @@ -5498,13 +5434,25 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, if ((rc == 0) && !dup_op && fi_pass_noreply) { rc = -DER_TIMEDOUT; - D_DEBUG(DB_MD, DF_UUID ": fault injected: DAOS_MD_OP_PASS_NOREPLY\n", - DP_UUID(in->ci_uuid)); + D_DEBUG(DB_MD, DF_CONT ": fault injected: DAOS_MD_OP_PASS_NOREPLY\n", + DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid)); } if ((rc == -DER_MISC) && !dup_op && fi_fail_noreply) { rc = -DER_TIMEDOUT; - D_DEBUG(DB_MD, DF_UUID ": fault injected: DAOS_MD_OP_FAIL_NOREPLY\n", - DP_UUID(in->ci_uuid)); + D_DEBUG(DB_MD, DF_CONT ": fault injected: DAOS_MD_OP_FAIL_NOREPLY\n", + DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid)); + } + if ((rc == 0) && !dup_op && fi_pass_nl_noreply) { + rc = -DER_TIMEDOUT; + D_DEBUG(DB_MD, DF_CONT ": fault injected: DAOS_MD_OP_PASS_NOREPLY_NEWLDR\n", + DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid)); + rdb_resign(svc->cs_rsvc->s_db, svc->cs_rsvc->s_term); + } + if ((rc == -DER_MISC) && !dup_op && fi_fail_nl_noreply) { + rc = -DER_TIMEDOUT; + D_DEBUG(DB_MD, DF_CONT ": fault injected: DAOS_MD_OP_FAIL_NOREPLY_NEWLDR\n", + DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid)); + rdb_resign(svc->cs_rsvc->s_db, svc->cs_rsvc->s_term); } D_DEBUG(DB_MD, DF_CONT": opc=%d returning, "DF_RC"\n", diff --git a/src/container/srv_internal.h b/src/container/srv_internal.h index 67ccc74582d4..5dc9f1333b27 100644 --- a/src/container/srv_internal.h +++ b/src/container/srv_internal.h @@ -85,8 +85,7 @@ struct cont_svc { rdb_path_t cs_root; /* root KVS */ rdb_path_t cs_uuids; /* container UUIDs KVS */ rdb_path_t cs_conts; /* container KVS */ - rdb_path_t cs_hdls; /* container handle KVS */ - rdb_path_t cs_ops; /* metadata ops KVS */ + rdb_path_t cs_hdls; /* container handle KVS */ struct ds_pool *cs_pool; /* Manage the EC aggregation epoch */ diff --git a/src/container/srv_layout.c b/src/container/srv_layout.c index 359e6574034d..35bb16e0b712 100644 --- a/src/container/srv_layout.c +++ b/src/container/srv_layout.c @@ -17,9 +17,6 @@ RDB_STRING_KEY(ds_cont_prop_, cuuids); RDB_STRING_KEY(ds_cont_prop_, conts); RDB_STRING_KEY(ds_cont_prop_, cont_handles); RDB_STRING_KEY(ds_cont_prop_, oit_oids); -RDB_STRING_KEY(ds_cont_prop_, svc_ops); -RDB_STRING_KEY(ds_cont_prop_, svc_ops_enabled); - /* Container properties KVS */ RDB_STRING_KEY(ds_cont_prop_, ghce); RDB_STRING_KEY(ds_cont_prop_, ghpce); diff --git a/src/container/srv_layout.h b/src/container/srv_layout.h index f9e9b0a590bf..d4f2622aba27 100644 --- a/src/container/srv_layout.h +++ b/src/container/srv_layout.h @@ -43,9 +43,9 @@ * * extern d_iov_t ds_cont_prop_new_key; comment_on_value_type * - * Note 1. 
The "new_key" name in ds_cont_prop_new_key must not appear (with very few exceptions) - * in the root KVS in src/pool/srv_layout.h, that is, there must usually not be a - * ds_pool_prop_new_key, because the two root KVSs are the same RDB KVS. + * Note 1. The "new_key" name in ds_cont_prop_new_key must not appear in the root KVS in + * src/pool/srv_layout.h, that is, there must not be a ds_pool_prop_new_key, because the two root + * KVSs are the same RDB KVS. * * Note 2. The comment_on_value_type shall focus on the value type only; * usage shall be described above in this comment following existing @@ -53,9 +53,7 @@ */ extern d_iov_t ds_cont_prop_cuuids; /* container UUIDs KVS */ extern d_iov_t ds_cont_prop_conts; /* container KVS */ -extern d_iov_t ds_cont_prop_cont_handles; /* container handle KVS */ -extern d_iov_t ds_cont_prop_svc_ops; /* service ops KVS - common to pool, container */ -extern d_iov_t ds_cont_prop_svc_ops_enabled; /* uint32_t - common to pool, container */ +extern d_iov_t ds_cont_prop_cont_handles; /* container handle KVS */ /* Please read the IMPORTANT notes above before adding new keys. */ /* diff --git a/src/control/SConscript b/src/control/SConscript index 933b090b4429..490ca2a94cd2 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -1,7 +1,9 @@ """Build DAOS Control Plane""" # pylint: disable=too-many-locals import os +import socket from binascii import b2a_hex +from datetime import datetime, timezone from os import urandom from os.path import join @@ -43,13 +45,19 @@ def gen_build_id(): return '0x' + buildid.decode() -def go_ldflags(): +def go_ldflags(benv): "Create the ldflags option for the Go build." + build_host = '' + if not is_release_build(benv): + build_host = socket.getfqdn() + build_time = datetime.now(timezone.utc).astimezone().isoformat() Import('daos_version', 'conf_dir') path = 'github.com/daos-stack/daos/src/control/build' return ' '.join([f'-X {path}.DaosVersion={daos_version}', f'-X {path}.ConfigDir={conf_dir}', + f'-X {path}.BuildTime={build_time}', + f'-X {path}.BuildHost={build_host}', f'-B $({gen_build_id()}$)']) @@ -72,7 +80,7 @@ def install_go_bin(env, name, libs=None, install_man=False): target = env.d_run_command(name, sources, libs, f'cd {gosrc}; {env.d_go_bin} build -mod vendor ' - + f'-ldflags "{go_ldflags()}" ' + + f'-ldflags "{go_ldflags(env)}" ' + f'{get_build_flags(env)} ' + f'{get_build_tags(env)} ' + f'-o {build_bin} {install_src}') diff --git a/src/control/build/string.go b/src/control/build/string.go index 7f8c6a97fdef..b4b372312a85 100644 --- a/src/control/build/string.go +++ b/src/control/build/string.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "strings" + "time" ) func revString(version string) string { @@ -41,17 +42,24 @@ func String(name string) string { // MarshalJSON returns a JSON string containing a structured representation of // the binary build info. func MarshalJSON(name string) ([]byte, error) { + // Not a fatal error if the build time can't be parsed. 
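An aside before the rest of the `MarshalJSON` hunk: the `SConscript` change above relies on Go's `-X` linker flag to stamp `BuildTime` and `BuildHost` into the binary at link time. A minimal, self-contained sketch of that mechanism (the package path and variable name here are illustrative, not the DAOS ones):

```go
package main

import "fmt"

// buildTime is deliberately empty in source; the linker can inject a value:
//
//	go build -ldflags "-X main.buildTime=2024-02-02T09:15:00-08:00"
var buildTime string

func main() {
	if buildTime == "" {
		fmt.Println("build time not recorded (set it via -ldflags -X)")
		return
	}
	fmt.Println("built at", buildTime)
}
```

Only package-level string variables can be set this way, which is why `variables.go` declares `BuildTime`/`BuildHost` as plain strings and `MarshalJSON` parses the timestamp afterwards.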
+ buildTime, _ := time.Parse(time.RFC3339, BuildTime) + return json.Marshal(&struct { - Name string `json:"name"` - Version string `json:"version"` - Revision string `json:"revision,omitempty"` - Dirty bool `json:"dirty,omitempty"` - Release bool `json:"release,omitempty"` + Name string `json:"name"` + Version string `json:"version"` + Revision string `json:"revision,omitempty"` + Dirty bool `json:"dirty,omitempty"` + Release bool `json:"release,omitempty"` + BuildHost string `json:"build_host,omitempty"` + BuildTime time.Time `json:"build_time,omitempty"` }{ - Name: name, - Version: DaosVersion, - Revision: Revision, - Dirty: DirtyBuild, - Release: ReleaseBuild, + Name: name, + Version: DaosVersion, + Revision: Revision, + Dirty: DirtyBuild, + Release: ReleaseBuild, + BuildHost: BuildHost, + BuildTime: buildTime, }) } diff --git a/src/control/build/variables.go b/src/control/build/variables.go index f915306098fe..1a140d3b4875 100644 --- a/src/control/build/variables.go +++ b/src/control/build/variables.go @@ -11,9 +11,13 @@ import "time" var ( // ConfigDir should be set via linker flag using the value of CONF_DIR. - ConfigDir string = "./" + ConfigDir = "./" // DaosVersion should be set via linker flag using the value of DAOS_VERSION. - DaosVersion string = "unset" + DaosVersion = "unset" + // BuildTime should be set via linker flag using the value of BUILD_TIME. + BuildTime = "" + // BuildHost should be set via linker flag using the value of BUILD_HOST. + BuildHost = "" // ControlPlaneName defines a consistent name for the control plane server. ControlPlaneName = "DAOS Control Server" // DataPlaneName defines a consistent name for the engine. diff --git a/src/control/cmd/daos/filesystem.go b/src/control/cmd/daos/filesystem.go index e7fcf828a457..8baa1c935357 100644 --- a/src/control/cmd/daos/filesystem.go +++ b/src/control/cmd/daos/filesystem.go @@ -291,7 +291,6 @@ type fsCheckCmd struct { FsckFlags FsCheckFlag `long:"flags" short:"f" description:"comma-separated flags: print, remove, relink, verify, evict"` DirName string `long:"dir-name" short:"n" description:"directory name under lost+found to store leaked oids (a timestamp dir would be created if this is not specified)"` - Evict bool `long:"evict" short:"e" description:"evict all open handles on the container"` } func (cmd *fsCheckCmd) Execute(_ []string) error { diff --git a/src/control/cmd/daos/main.go b/src/control/cmd/daos/main.go index 377aa22ad1eb..a38632426c88 100644 --- a/src/control/cmd/daos/main.go +++ b/src/control/cmd/daos/main.go @@ -24,15 +24,15 @@ import ( ) type cliOptions struct { - Debug bool `long:"debug" description:"enable debug output"` - Verbose bool `long:"verbose" description:"enable verbose output (when applicable)"` - JSON bool `long:"json" short:"j" description:"enable JSON output"` - Container containerCmd `command:"container" alias:"cont" description:"perform tasks related to DAOS containers"` - Pool poolCmd `command:"pool" description:"perform tasks related to DAOS pools"` + Debug bool `long:"debug" description:"Enable debug output"` + Verbose bool `long:"verbose" description:"Enable verbose output (when applicable)"` + JSON bool `long:"json" short:"j" description:"Enable JSON output"` + Container containerCmd `command:"container" alias:"cont" description:"Perform tasks related to DAOS containers"` + Pool poolCmd `command:"pool" description:"Perform tasks related to DAOS pools"` Filesystem fsCmd `command:"filesystem" alias:"fs" description:"POSIX filesystem operations"` Object objectCmd 
`command:"object" alias:"obj" description:"DAOS object operations"` System systemCmd `command:"system" alias:"sys" description:"DAOS system operations"` - Version versionCmd `command:"version" description:"print daos version"` + Version versionCmd `command:"version" description:"Print daos version"` ManPage cmdutil.ManCmd `command:"manpage" hidden:"true"` } diff --git a/src/control/cmd/daos_agent/attachinfo.go b/src/control/cmd/daos_agent/attachinfo.go index 76e42e53974c..4f65605656d3 100644 --- a/src/control/cmd/daos_agent/attachinfo.go +++ b/src/control/cmd/daos_agent/attachinfo.go @@ -7,7 +7,6 @@ package main import ( - "context" "fmt" "os" @@ -21,6 +20,7 @@ import ( type dumpAttachInfoCmd struct { configCmd ctlInvokerCmd + cmdutil.LogCmd cmdutil.JSONOutputCmd Output string `short:"o" long:"output" default:"stdout" description:"Dump output to this location"` } @@ -36,7 +36,7 @@ func (cmd *dumpAttachInfoCmd) Execute(_ []string) error { out = f } - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.GetAttachInfoReq{ AllRanks: true, } diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go index 3a6f7a14368c..ad8fc40924d4 100644 --- a/src/control/cmd/daos_agent/config.go +++ b/src/control/cmd/daos_agent/config.go @@ -8,7 +8,7 @@ package main import ( "fmt" - "io/ioutil" + "os" "time" "github.com/pkg/errors" @@ -23,7 +23,6 @@ import ( const ( defaultConfigFile = "daos_agent.yml" defaultRuntimeDir = "/var/run/daos_agent" - defaultLogFile = "/tmp/daos_agent.log" ) type refreshMinutes time.Duration @@ -73,20 +72,20 @@ type FabricInterfaceConfig struct { // LoadConfig reads a config file and uses it to populate a Config. func LoadConfig(cfgPath string) (*Config, error) { if cfgPath == "" { - return nil, errors.New("no path supplied") + return nil, errors.New("no config path supplied") } - data, err := ioutil.ReadFile(cfgPath) + data, err := os.ReadFile(cfgPath) if err != nil { - return nil, err + return nil, errors.Wrap(err, "reading config file") } cfg := DefaultConfig() if err := yaml.UnmarshalStrict(data, cfg); err != nil { - return nil, err + return nil, errors.Wrapf(err, "parsing config: %s", cfgPath) } if !daos.SystemNameIsValid(cfg.SystemName) { - return nil, fmt.Errorf("invalid system name: %q", cfg.SystemName) + return nil, fmt.Errorf("invalid system name: %s", cfg.SystemName) } return cfg, nil @@ -100,7 +99,6 @@ func DefaultConfig() *Config { ControlPort: build.DefaultControlPort, AccessPoints: []string{localServer}, RuntimeDir: defaultRuntimeDir, - LogFile: defaultLogFile, LogLevel: common.DefaultControlLogLevel, TransportConfig: security.DefaultAgentTransportConfig(), } diff --git a/src/control/cmd/daos_agent/config_test.go b/src/control/cmd/daos_agent/config_test.go index e21920e3735c..3a58e2c5616d 100644 --- a/src/control/cmd/daos_agent/config_test.go +++ b/src/control/cmd/daos_agent/config_test.go @@ -87,7 +87,7 @@ transport_config: expErr error }{ "empty path": { - expErr: errors.New("no path"), + expErr: errors.New("no config path"), }, "bad path": { path: "/not/real/path", diff --git a/src/control/cmd/daos_agent/main.go b/src/control/cmd/daos_agent/main.go index f6906a1fc834..ac5646365d0c 100644 --- a/src/control/cmd/daos_agent/main.go +++ b/src/control/cmd/daos_agent/main.go @@ -132,18 +132,14 @@ func parseOpts(args []string, opts *cliOptions, invoker control.Invoker, log *lo logCmd.SetLog(log) } - if jsonCmd, ok := cmd.(cmdutil.JSONOutputter); ok && opts.JSON { - jsonCmd.EnableJSONOutput(os.Stdout, &wroteJSON) - // 
disable output on stdout other than JSON - log.ClearLevel(logging.LogLevelInfo) - } - if opts.Debug { log.SetLevel(logging.LogLevelTrace) } - if opts.JSONLogs { - log.WithJSONOutput() + if jsonCmd, ok := cmd.(cmdutil.JSONOutputter); ok && opts.JSON { + jsonCmd.EnableJSONOutput(os.Stdout, &wroteJSON) + // disable output on stdout other than JSON + log.ClearLevel(logging.LogLevelInfo) } switch cmd.(type) { @@ -164,69 +160,15 @@ func parseOpts(args []string, opts *cliOptions, invoker control.Invoker, log *lo } } - cfg := DefaultConfig() - if cfgPath != "" { - var err error - if cfg, err = LoadConfig(cfgPath); err != nil { - return errors.WithMessage(err, "failed to load agent configuration") - } - - // Command line debug option overrides log level in config file - if !opts.Debug { - log.WithLogLevel(logging.LogLevel(cfg.LogLevel)) - } - log.Debugf("agent config loaded from %s", cfgPath) + cfg, err := processConfig(log, cmd, opts, cfgPath) + if err != nil { + return err } if suppCmd, ok := cmd.(supportAgentConfig); ok { suppCmd.setSupportConf(cfgPath) } - if opts.RuntimeDir != "" { - log.Debugf("Overriding socket path from config file with %s", opts.RuntimeDir) - cfg.RuntimeDir = opts.RuntimeDir - } - - if opts.LogFile != "" { - log.Debugf("Overriding LogFile path from config file with %s", opts.LogFile) - cfg.LogFile = opts.LogFile - } - - if opts.Insecure { - log.Debugf("Overriding AllowInsecure from config file with %t", opts.Insecure) - cfg.TransportConfig.AllowInsecure = true - } - - if cfg.LogFile != "" { - f, err := common.AppendFile(cfg.LogFile) - if err != nil { - log.Errorf("Failure creating log file: %s", err) - return err - } - defer f.Close() - - // Create an additional set of loggers which append everything - // to the specified file. - log.WithErrorLogger(logging.NewErrorLogger("agent", f)). - WithNoticeLogger(logging.NewNoticeLogger("agent", f)). - WithInfoLogger(logging.NewInfoLogger("agent", f)). - WithDebugLogger(logging.NewDebugLogger(f)). - WithTraceLogger(logging.NewTraceLogger(f)) - } - - if err := cfg.TransportConfig.PreLoadCertData(); err != nil { - return errors.Wrap(err, "Unable to load Certificate Data") - } - - var err error - if cfg.AccessPoints, err = common.ParseHostList(cfg.AccessPoints, cfg.ControlPort); err != nil { - return errors.Wrap(err, "Failed to parse config access_points") - } - - if cfgCmd, ok := cmd.(configSetter); ok { - cfgCmd.setConfig(cfg) - } - if ctlCmd, ok := cmd.(ctlInvoker); ok { // Generate a control config based on the loaded agent config. 
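The `parseOpts` reordering above hinges on Go's comma-ok type assertion to discover per-command capabilities. A small sketch of that pattern, with stand-in names (`jsonOutputter`, `listCmd`) rather than the real `cmdutil` types:

```go
package main

import "fmt"

// jsonOutputter stands in for the cmdutil.JSONOutputter capability interface.
type jsonOutputter interface {
	EnableJSONOutput()
}

type listCmd struct{ jsonEnabled bool }

func (c *listCmd) EnableJSONOutput() { c.jsonEnabled = true }

// configure enables JSON output only for commands that implement the
// capability; the comma-ok assertion doubles as feature detection.
func configure(cmd interface{}, wantJSON bool) {
	if jc, ok := cmd.(jsonOutputter); ok && wantJSON {
		jc.EnableJSONOutput()
	}
}

func main() {
	cmd := &listCmd{}
	configure(cmd, true)
	fmt.Println("json enabled:", cmd.jsonEnabled)
}
```

The same pattern drives the `configSetter`, `ctlInvoker`, and `supportAgentConfig` checks later in the function.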
ctlCfg := control.DefaultConfig() @@ -251,6 +193,75 @@ func parseOpts(args []string, opts *cliOptions, invoker control.Invoker, log *lo return err } +func processConfig(log logging.Logger, cmd flags.Commander, opts *cliOptions, cfgPath string) (*Config, error) { + cfg := DefaultConfig() + if cfgPath != "" { + var err error + if cfg, err = LoadConfig(cfgPath); err != nil { + return nil, errors.Wrap(err, "failed to load agent configuration") + } + } + + if opts.LogFile != "" { + log.Debugf("Overriding LogFile path from config file with %s", opts.LogFile) + cfg.LogFile = opts.LogFile + } + + if opts.Debug { + cfg.LogLevel = common.ControlLogLevelTrace + } + + if err := configureLogging(log, cmd, cfg, opts); err != nil { + return nil, err + } + + if opts.RuntimeDir != "" { + log.Debugf("Overriding socket path from config file with %s", opts.RuntimeDir) + cfg.RuntimeDir = opts.RuntimeDir + } + + if opts.Insecure { + log.Debugf("Overriding AllowInsecure from config file with %t", opts.Insecure) + cfg.TransportConfig.AllowInsecure = true + } + + if err := cfg.TransportConfig.PreLoadCertData(); err != nil { + return nil, errors.Wrap(err, "Unable to load Certificate Data") + } + + var err error + if cfg.AccessPoints, err = common.ParseHostList(cfg.AccessPoints, cfg.ControlPort); err != nil { + return nil, errors.Wrap(err, "Failed to parse config access_points") + } + + if cfgCmd, ok := cmd.(configSetter); ok { + cfgCmd.setConfig(cfg) + } + + if cfgPath != "" { + log.Infof("loaded agent config from path: %s", cfgPath) + } + + return cfg, nil +} + +func configureLogging(log logging.Logger, cmd flags.Commander, cfg *Config, opts *cliOptions) error { + if logCmd, ok := cmd.(cmdutil.LogSetter); ok { + logCmd.SetLog(log) + + logCfg := cmdutil.LogConfig{ + LogFile: cfg.LogFile, + LogLevel: cfg.LogLevel, + JSON: opts.JSONLogs, + } + if err := cmdutil.ConfigureLogger(log, logCfg); err != nil { + return err + } + } + + return nil +} + func main() { var opts cliOptions log := logging.NewCommandLineLogger() diff --git a/src/control/cmd/daos_agent/network.go b/src/control/cmd/daos_agent/network.go index 42c3407ca49a..28215a9c3195 100644 --- a/src/control/cmd/daos_agent/network.go +++ b/src/control/cmd/daos_agent/network.go @@ -7,7 +7,6 @@ package main import ( - "context" "strings" "github.com/daos-stack/daos/src/control/cmd/dmg/pretty" @@ -32,7 +31,7 @@ func (cmd *netScanCmd) Execute(_ []string) error { fabricScanner := hwprov.DefaultFabricScanner(cmd.Logger) - results, err := fabricScanner.Scan(context.Background(), prov) + results, err := fabricScanner.Scan(cmd.MustLogCtx(), prov) if err != nil { return nil } diff --git a/src/control/cmd/daos_agent/start.go b/src/control/cmd/daos_agent/start.go index 791af47ba639..cb5505234d52 100644 --- a/src/control/cmd/daos_agent/start.go +++ b/src/control/cmd/daos_agent/start.go @@ -54,7 +54,7 @@ func (cmd *startCmd) Execute(_ []string) error { cmd.Infof("Starting %s (pid %d)", versionString(), os.Getpid()) startedAt := time.Now() - parent, shutdown := context.WithCancel(context.Background()) + parent, shutdown := context.WithCancel(cmd.MustLogCtx()) defer shutdown() var shuttingDown atm.Bool diff --git a/src/control/cmd/daos_server/auto.go b/src/control/cmd/daos_server/auto.go index d4ccc1c2ec12..93a3fd83626f 100644 --- a/src/control/cmd/daos_server/auto.go +++ b/src/control/cmd/daos_server/auto.go @@ -192,5 +192,5 @@ func (cmd *configGenCmd) Execute(_ []string) error { return err } - return cmd.confGenPrint(context.Background(), getLocalFabric, getLocalStorage) 
+ return cmd.confGenPrint(cmd.MustLogCtx(), getLocalFabric, getLocalStorage) } diff --git a/src/control/cmd/daos_server/auto_test.go b/src/control/cmd/daos_server/auto_test.go index 46a4d84a2eb8..526b5bcaa54c 100644 --- a/src/control/cmd/daos_server/auto_test.go +++ b/src/control/cmd/daos_server/auto_test.go @@ -235,13 +235,13 @@ func TestDaosServer_Auto_confGen(t *testing.T) { } // Nr hugepages expected with 18+1 (extra MD-on-SSD sys-xstream) targets * 2 engines * 512 // pages-per-target. - mdonssdNrHugepages := 19 * 2 * 512 - mdonssdHugeMemGiB := (humanize.MiByte * 2 * mdonssdNrHugepages) / humanize.GiByte + mdOnSSDNrHugepages := 19 * 2 * 512 + mdOnSSDHugeMemGiB := (humanize.MiByte * 2 * mdOnSSDNrHugepages) / humanize.GiByte // Total mem to meet requirements 39GiB hugeMem, 2GiB per engine rsvd, 6GiB sys rsvd, 4GiB // per engine RAM-disk. - mdonssdMemTotalGiB := humanize.GiByte * (mdonssdHugeMemGiB + (2 * engRsvdGiB) + sysRsvdGiB + + mdOnSSDMemTotalGiB := humanize.GiByte * (mdOnSSDHugeMemGiB + (2 * engRsvdGiB) + sysRsvdGiB + (2 * ramdiskGiB) + 1 /* add 1GiB buffer */) - mdonssdEngineCfgs := []*engine.Config{ + mdOnSSDEngineCfgs := []*engine.Config{ control.MockEngineCfgTmpfs(0, ramdiskGiB, control.MockBdevTierWithRole(0, storage.BdevRoleWAL, 2), control.MockBdevTierWithRole(0, storage.BdevRoleMeta|storage.BdevRoleData, 4)). @@ -541,7 +541,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { }, MemInfo: &common.MemInfo{ HugepageSizeKiB: 2048, - MemTotalKiB: mdonssdMemTotalGiB / humanize.KiByte, + MemTotalKiB: mdOnSSDMemTotalGiB / humanize.KiByte, }, NvmeDevices: storage.NvmeControllers{ storage.MockNvmeController(1), @@ -550,8 +550,8 @@ func TestDaosServer_Auto_confGen(t *testing.T) { storage.MockNvmeController(4), }, }, - expCfg: control.MockServerCfg("ofi+psm2", mdonssdEngineCfgs). - WithNrHugepages(mdonssdNrHugepages). + expCfg: control.MockServerCfg("ofi+psm2", mdOnSSDEngineCfgs). + WithNrHugepages(mdOnSSDNrHugepages). WithAccessPoints("localhost:10001"). WithControlLogFile("/tmp/daos_server.log"). WithControlMetadata(controlMetadata), @@ -573,7 +573,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { }, MemInfo: &common.MemInfo{ HugepageSizeKiB: 2048, - MemTotalKiB: mdonssdMemTotalGiB / humanize.KiByte, + MemTotalKiB: mdOnSSDMemTotalGiB / humanize.KiByte, }, NvmeDevices: storage.NvmeControllers{ storage.MockNvmeController(1), diff --git a/src/control/cmd/daos_server/main.go b/src/control/cmd/daos_server/main.go index 88c21bf5809f..1cb5f1f0d5cb 100644 --- a/src/control/cmd/daos_server/main.go +++ b/src/control/cmd/daos_server/main.go @@ -37,7 +37,7 @@ type mainOpts struct { // TODO(DAOS-3129): This should be -d, but it conflicts with the start // subcommand's -d flag when we default to running it. 
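A quick check of the hugepage arithmetic in the `auto_test.go` changes above, assuming the test's 2 MiB hugepage size (`HugepageSizeKiB: 2048`); this simply evaluates the formulas as written:

```go
package main

import "fmt"

func main() {
	const (
		MiB = int64(1) << 20
		GiB = int64(1) << 30
	)
	// (18 targets + 1 MD-on-SSD sys-xstream) * 2 engines * 512 pages per target.
	nrHugepages := int64(19 * 2 * 512)
	// Each hugepage is 2 MiB, matching the test fixtures.
	hugeMemGiB := (2 * MiB * nrHugepages) / GiB

	fmt.Println(nrHugepages, hugeMemGiB) // 19456 38
}
```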
Debug bool `short:"b" long:"debug" description:"Enable debug output"` - JSON bool `long:"json" short:"j" description:"enable JSON output"` + JSON bool `long:"json" short:"j" description:"Enable JSON output"` JSONLog bool `short:"J" long:"json-logging" description:"Enable JSON-formatted log output"` Syslog bool `long:"syslog" description:"Enable logging to syslog"` diff --git a/src/control/cmd/daos_server/network.go b/src/control/cmd/daos_server/network.go index d619cbc74579..a0b80cb7ae8a 100644 --- a/src/control/cmd/daos_server/network.go +++ b/src/control/cmd/daos_server/network.go @@ -83,7 +83,7 @@ func (cmd *networkScanCmd) Execute(_ []string) error { return err } - ctx := context.Background() + ctx := cmd.MustLogCtx() fs := hwprov.DefaultFabricScanner(cmd.Logger) var prov string diff --git a/src/control/cmd/daos_server/start.go b/src/control/cmd/daos_server/start.go index 962d370db964..f3270059aa4f 100644 --- a/src/control/cmd/daos_server/start.go +++ b/src/control/cmd/daos_server/start.go @@ -7,11 +7,8 @@ package main import ( - "os" - "github.com/pkg/errors" - "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server" @@ -85,72 +82,17 @@ func (cmd *startCmd) setCLIOverrides() error { } func (cmd *startCmd) configureLogging() error { - log, ok := cmd.Logger.(*logging.LeveledLogger) - if !ok { - return errors.New("logger is not a LeveledLogger") - } - - // Set log level mask for default logger from config, - // unless it was explicitly set to debug via CLI flag. - applyLogConfig := func() error { - switch logging.LogLevel(cmd.config.ControlLogMask) { - case logging.LogLevelTrace: - log.SetLevel(logging.LogLevelTrace) - cmd.Debugf("Switching control log level to TRACE") - case logging.LogLevelDebug: - log.SetLevel(logging.LogLevelDebug) - cmd.Debugf("Switching control log level to DEBUG") - case logging.LogLevelNotice: - log.SetLevel(logging.LogLevelNotice) - cmd.Debugf("Switching control log level to NOTICE") - case logging.LogLevelError: - cmd.Debugf("Switching control log level to ERROR") - log.SetLevel(logging.LogLevelError) - } - - if cmd.config.ControlLogJSON { - cmd.Logger = log.WithJSONOutput() - } - - return nil - } - - hostname, err := os.Hostname() - if err != nil { - return err - } - for i, srv := range cmd.config.Engines { if srv.LogFile == "" { cmd.Errorf("no daos log file specified for server %d", i) } } - // Set log file for default logger if specified in config. - if cmd.config.ControlLogFile != "" { - f, err := common.AppendFile(cmd.config.ControlLogFile) - if err != nil { - return errors.WithMessage(err, "create log file") - } - - cmd.Infof("%s logging to file %s", - os.Args[0], cmd.config.ControlLogFile) - - // Create an additional set of loggers which append everything - // to the specified file. - cmd.Logger = log. - WithErrorLogger(logging.NewErrorLogger(hostname, f)). - WithNoticeLogger(logging.NewNoticeLogger(hostname, f)). - WithInfoLogger(logging.NewInfoLogger(hostname, f)). - WithDebugLogger(logging.NewDebugLogger(f)). 
- WithTraceLogger(logging.NewTraceLogger(f)) - - return applyLogConfig() - } - - cmd.Info("no control log file specified; logging to stdout") - - return applyLogConfig() + return cmdutil.ConfigureLogger(cmd.Logger, cmdutil.LogConfig{ + LogFile: cmd.config.ControlLogFile, + LogLevel: cmd.config.ControlLogMask, + JSON: cmd.config.ControlLogJSON, + }) } func (cmd *startCmd) Execute(args []string) error { diff --git a/src/control/cmd/dmg/auto.go b/src/control/cmd/dmg/auto.go index a85638c14d00..3e97a528cc8a 100644 --- a/src/control/cmd/dmg/auto.go +++ b/src/control/cmd/dmg/auto.go @@ -126,5 +126,5 @@ func (cmd *configGenCmd) confGenPrint(ctx context.Context) error { // parameters suitable to be used across all hosts in provided host list. Use the control API to // generate config from remote scan results. func (cmd *configGenCmd) Execute(_ []string) error { - return cmd.confGenPrint(context.Background()) + return cmd.confGenPrint(cmd.MustLogCtx()) } diff --git a/src/control/cmd/dmg/cont.go b/src/control/cmd/dmg/cont.go index 5ad087b0676f..8906773bd1a8 100644 --- a/src/control/cmd/dmg/cont.go +++ b/src/control/cmd/dmg/cont.go @@ -7,8 +7,6 @@ package main import ( - "context" - "github.com/jessevdk/go-flags" "github.com/pkg/errors" @@ -49,7 +47,7 @@ func (c *ContSetOwnerCmd) Execute(args []string) error { Group: c.GroupName.String(), } - ctx := context.Background() + ctx := c.MustLogCtx() err := control.ContSetOwner(ctx, c.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "FAILED").Error() diff --git a/src/control/cmd/dmg/firmware.go b/src/control/cmd/dmg/firmware.go index 16748d37b32e..4154a980b8dc 100644 --- a/src/control/cmd/dmg/firmware.go +++ b/src/control/cmd/dmg/firmware.go @@ -7,7 +7,6 @@ package main import ( - "context" "io" "strings" @@ -42,7 +41,7 @@ type firmwareQueryCmd struct { // Execute runs the firmware query command. func (cmd *firmwareQueryCmd) Execute(args []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.FirmwareQueryReq{ SCM: cmd.isSCMRequested(), @@ -127,7 +126,7 @@ type firmwareUpdateCmd struct { // Execute runs the firmware update command. 
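The `configureLogging` rewrite above collapses the hand-rolled logger wiring into one shared `cmdutil.ConfigureLogger` call fed by a `cmdutil.LogConfig`. A compact sketch of that consolidation, using the standard library rather than the DAOS `logging` package:

```go
package main

import (
	"log"
	"os"
)

// logConfig mirrors the shape of the consolidated settings: one struct
// carries the file/JSON knobs instead of wiring spread across two binaries.
type logConfig struct {
	LogFile string
	JSON    bool
}

func configureLogger(cfg logConfig) (*log.Logger, error) {
	out := os.Stdout
	if cfg.LogFile != "" {
		// Append rather than truncate, matching the old AppendFile behavior.
		f, err := os.OpenFile(cfg.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
		if err != nil {
			return nil, err
		}
		out = f
	}
	return log.New(out, "", log.LstdFlags), nil
}

func main() {
	logger, err := configureLogger(logConfig{})
	if err != nil {
		panic(err)
	}
	logger.Println("logging configured")
}
```

Centralizing this removes the duplicated file-append and level-switch logic that previously lived in both `daos_server/start.go` and `daos_agent/main.go`.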
func (cmd *firmwareUpdateCmd) Execute(args []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.FirmwareUpdateReq{ FirmwarePath: cmd.FilePath, diff --git a/src/control/cmd/dmg/network.go b/src/control/cmd/dmg/network.go index e55ddadb6356..8ccd25a7a533 100644 --- a/src/control/cmd/dmg/network.go +++ b/src/control/cmd/dmg/network.go @@ -7,7 +7,6 @@ package main import ( - "context" "strings" "github.com/daos-stack/daos/src/control/cmd/dmg/pretty" @@ -32,7 +31,7 @@ type networkScanCmd struct { } func (cmd *networkScanCmd) Execute(_ []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.NetworkScanReq{ Provider: cmd.FabricProvider, } diff --git a/src/control/cmd/dmg/pool.go b/src/control/cmd/dmg/pool.go index 4385859e3267..32bdfdc67236 100644 --- a/src/control/cmd/dmg/pool.go +++ b/src/control/cmd/dmg/pool.go @@ -325,7 +325,7 @@ func (cmd *PoolCreateCmd) Execute(args []string) error { } } - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.PoolCreateReq{ User: cmd.UserName.String(), UserGroup: cmd.GroupName.String(), @@ -408,7 +408,7 @@ func (cmd *PoolListCmd) Execute(_ []string) (errOut error) { NoQuery: cmd.NoQuery, } - initialResp, err := control.ListPools(context.Background(), cmd.ctlInvoker, req) + initialResp, err := control.ListPools(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } @@ -486,7 +486,7 @@ func (cmd *PoolDestroyCmd) Execute(args []string) error { Recursive: cmd.Recursive, } - err := control.PoolDestroy(context.Background(), cmd.ctlInvoker, req) + err := control.PoolDestroy(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -508,7 +508,7 @@ func (cmd *PoolEvictCmd) Execute(args []string) error { req := &control.PoolEvictReq{ID: cmd.PoolID().String()} - err := control.PoolEvict(context.Background(), cmd.ctlInvoker, req) + err := control.PoolEvict(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -536,7 +536,7 @@ func (cmd *PoolExcludeCmd) Execute(args []string) error { req := &control.PoolExcludeReq{ID: cmd.PoolID().String(), Rank: ranklist.Rank(cmd.Rank), Targetidx: idxlist} - err := control.PoolExclude(context.Background(), cmd.ctlInvoker, req) + err := control.PoolExclude(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -565,7 +565,7 @@ func (cmd *PoolDrainCmd) Execute(args []string) error { req := &control.PoolDrainReq{ID: cmd.PoolID().String(), Rank: ranklist.Rank(cmd.Rank), Targetidx: idxlist} - err := control.PoolDrain(context.Background(), cmd.ctlInvoker, req) + err := control.PoolDrain(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -590,7 +590,7 @@ func (cmd *PoolExtendCmd) Execute(args []string) error { Ranks: cmd.RankList.Ranks(), } - err := control.PoolExtend(context.Background(), cmd.ctlInvoker, req) + err := control.PoolExtend(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -623,7 +623,7 @@ func (cmd *PoolReintegrateCmd) Execute(args []string) error { Targetidx: idxlist, } - err := control.PoolReintegrate(context.Background(), cmd.ctlInvoker, req) + err := control.PoolReintegrate(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { msg = errors.WithMessage(err, "failed").Error() } @@ -653,7 +653,7 @@ 
func (cmd *PoolQueryCmd) Execute(args []string) error { req.IncludeEnabledRanks = cmd.ShowEnabledRanks req.IncludeDisabledRanks = cmd.ShowDisabledRanks - resp, err := control.PoolQuery(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolQuery(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) @@ -693,7 +693,7 @@ func (cmd *PoolQueryTargetsCmd) Execute(args []string) error { Targets: tgtsList, } - resp, err := control.PoolQueryTargets(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolQueryTargets(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) @@ -722,7 +722,7 @@ func (cmd *PoolUpgradeCmd) Execute(args []string) error { ID: cmd.PoolID().String(), } - err := control.PoolUpgrade(context.Background(), cmd.ctlInvoker, req) + err := control.PoolUpgrade(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return errors.Wrap(err, "pool upgrade failed") } @@ -762,7 +762,7 @@ func (cmd *PoolSetPropCmd) Execute(_ []string) error { Properties: cmd.Args.Props.ToSet, } - err := control.PoolSetProp(context.Background(), cmd.ctlInvoker, req) + err := control.PoolSetProp(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(nil, err) } @@ -790,7 +790,7 @@ func (cmd *PoolGetPropCmd) Execute(_ []string) error { Properties: cmd.Args.Props.ToGet, } - resp, err := control.PoolGetProp(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolGetProp(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } @@ -819,7 +819,7 @@ type PoolGetACLCmd struct { func (cmd *PoolGetACLCmd) Execute(args []string) error { req := &control.PoolGetACLReq{ID: cmd.PoolID().String()} - resp, err := control.PoolGetACL(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolGetACL(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } @@ -889,7 +889,7 @@ func (cmd *PoolOverwriteACLCmd) Execute(args []string) error { ACL: acl, } - resp, err := control.PoolOverwriteACL(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolOverwriteACL(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } @@ -937,7 +937,7 @@ func (cmd *PoolUpdateACLCmd) Execute(args []string) error { ACL: acl, } - resp, err := control.PoolUpdateACL(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolUpdateACL(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } @@ -967,7 +967,7 @@ func (cmd *PoolDeleteACLCmd) Execute(args []string) error { Principal: cmd.Principal, } - resp, err := control.PoolDeleteACL(context.Background(), cmd.ctlInvoker, req) + resp, err := control.PoolDeleteACL(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } diff --git a/src/control/cmd/dmg/pretty/storage.go b/src/control/cmd/dmg/pretty/storage.go index 4f2a02fbe9dc..eba4dfc58b67 100644 --- a/src/control/cmd/dmg/pretty/storage.go +++ b/src/control/cmd/dmg/pretty/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -26,7 +26,8 @@ func printHostStorageMapVerbose(hsm control.HostStorageMap, out io.Writer, opts hosts := getPrintHosts(hss.HostSet.RangedString(), opts...) 
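All of the `dmg pool` hunks above make the same substitution: `context.Background()` becomes `cmd.MustLogCtx()`, so the command's logger travels with the context into the control API. A minimal sketch of the logger-in-context idea, with illustrative names rather than the actual `cmdutil` implementation:

```go
package main

import (
	"context"
	"fmt"
)

type logKey struct{}

// withLogger / loggerFrom are illustrative stand-ins, not the cmdutil API:
// the command's logger rides in the context so deep call sites can log
// without an extra parameter threaded through every signature.
func withLogger(ctx context.Context, log func(string)) context.Context {
	return context.WithValue(ctx, logKey{}, log)
}

func loggerFrom(ctx context.Context) func(string) {
	if log, ok := ctx.Value(logKey{}).(func(string)); ok {
		return log
	}
	return func(string) {} // no-op fallback when no logger was attached
}

func main() {
	ctx := withLogger(context.Background(), func(msg string) { fmt.Println("log:", msg) })
	loggerFrom(ctx)("pool query issued")
}
```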
lineBreak := strings.Repeat("-", len(hosts)) fmt.Fprintf(out, "%s\n%s\n%s\n", lineBreak, hosts, lineBreak) - fmt.Fprintf(out, "HugePage Size: %d KB\n", hss.HostStorage.MemInfo.HugepageSizeKiB) + fmt.Fprintf(out, "HugePage Size: %d KB\n\n", + hss.HostStorage.MemInfo.HugepageSizeKiB) if len(hss.HostStorage.ScmNamespaces) == 0 { if err := PrintScmModules(hss.HostStorage.ScmModules, out, opts...); err != nil { return err @@ -179,7 +180,8 @@ func printSmdDevice(dev *storage.SmdDevice, iw io.Writer, opts ...PrintConfigOpt fc := getPrintConfig(opts...) if fc.LEDInfoOnly { - if _, err := fmt.Fprintf(iw, "TrAddr:%s", dev.Ctrlr.PciAddr); err != nil { + if _, err := fmt.Fprintf(iw, "TrAddr:%s NSID:%d", dev.Ctrlr.PciAddr, + dev.CtrlrNamespaceID); err != nil { return err } if dev.UUID != "" { @@ -193,7 +195,8 @@ func printSmdDevice(dev *storage.SmdDevice, iw io.Writer, opts ...PrintConfigOpt return nil } - if _, err := fmt.Fprintf(iw, "UUID:%s [TrAddr:%s]\n", dev.UUID, dev.Ctrlr.PciAddr); err != nil { + if _, err := fmt.Fprintf(iw, "UUID:%s [TrAddr:%s NSID:%d]\n", dev.UUID, dev.Ctrlr.PciAddr, + dev.CtrlrNamespaceID); err != nil { return err } @@ -277,7 +280,7 @@ func PrintSmdInfoMap(omitDevs, omitPools bool, hsm control.HostStorageMap, out i fmt.Fprintln(out) } } else { - fmt.Fprintln(iw, "No pools found") + fmt.Fprintln(iw, "No pools with NVMe found") } } } diff --git a/src/control/cmd/dmg/pretty/storage_nvme.go b/src/control/cmd/dmg/pretty/storage_nvme.go index fefc3eef2852..4086499eabe4 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme.go +++ b/src/control/cmd/dmg/pretty/storage_nvme.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,7 +7,6 @@ package pretty import ( - "errors" "fmt" "io" "sort" @@ -18,6 +17,7 @@ import ( "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/lib/control" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/lib/txtfmt" "github.com/daos-stack/daos/src/control/server/storage" ) @@ -165,32 +165,39 @@ func parseNvmeFormatResults(inResults storage.NvmeControllers) storage.NvmeContr parsedResults := make(storage.NvmeControllers, 0, len(inResults)) for _, result := range inResults { if result.PciAddr != storage.NilBdevAddress { - // ignore skip results parsedResults = append(parsedResults, result) } } - return parsedResults } -func printNvmeFormatResults(devices storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { - if len(devices) == 0 { - fmt.Fprintln(out, "\tNo NVMe devices found") +func printNvmeFormatResults(inCtrlrs storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { + ctrlrs := parseNvmeFormatResults(inCtrlrs) + iw := txtfmt.NewIndentWriter(out) + if len(ctrlrs) == 0 { + fmt.Fprintln(iw, "No NVMe devices were formatted") return nil } pciTitle := "NVMe PCI" resultTitle := "Format Result" + rolesTitle := "Role(s)" - formatter := txtfmt.NewTableFormatter(pciTitle, resultTitle) + formatter := txtfmt.NewTableFormatter(pciTitle, resultTitle, rolesTitle) formatter.InitWriter(out) var table []txtfmt.TableRow - sort.Slice(devices, func(i, j int) bool { return devices[i].PciAddr < devices[j].PciAddr }) + sort.Slice(ctrlrs, func(i, j int) bool { return ctrlrs[i].PciAddr < ctrlrs[j].PciAddr }) - for _, device := range parseNvmeFormatResults(devices) { - row := txtfmt.TableRow{pciTitle: 
device.PciAddr} - row[resultTitle] = device.Info + for _, c := range ctrlrs { + row := txtfmt.TableRow{pciTitle: c.PciAddr} + row[resultTitle] = c.Info + roles := "NA" + // Assumes that all SMD devices on a controller have the same roles. + if len(c.SmdDevices) > 0 { + roles = fmt.Sprintf("%s", c.SmdDevices[0].Roles.String()) + } + row[rolesTitle] = roles table = append(table, row) } @@ -203,19 +210,22 @@ func printNvmeFormatResults(devices storage.NvmeControllers, out io.Writer, opts func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) + iw := txtfmt.NewIndentWriter(out) if len(controllers) == 0 { - fmt.Fprintln(out, "\tNo NVMe devices found") + fmt.Fprintln(iw, "No NVMe devices found") return w.Err } pciTitle := "NVMe PCI" modelTitle := "Model" fwTitle := "FW Revision" - socketTitle := "Socket ID" + socketTitle := "Socket" capacityTitle := "Capacity" + rolesTitle := "Role(s)" + rankTitle := "Rank" formatter := txtfmt.NewTableFormatter( - pciTitle, modelTitle, fwTitle, socketTitle, capacityTitle, + pciTitle, modelTitle, fwTitle, socketTitle, capacityTitle, rolesTitle, rankTitle, ) formatter.InitWriter(out) var table []txtfmt.TableRow @@ -228,6 +238,18 @@ func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, op row[fwTitle] = ctrlr.FwRev row[socketTitle] = fmt.Sprint(ctrlr.SocketID) row[capacityTitle] = humanize.Bytes(ctrlr.Capacity()) + roles := "NA" + rank := "None" + // Assumes that all SMD devices on a controller have the same roles and rank. + if len(ctrlr.SmdDevices) > 0 { + sd := ctrlr.SmdDevices[0] + roles = sd.Roles.String() + if sd.Rank != ranklist.NilRank { + rank = sd.Rank.String() + } + } + row[rolesTitle] = roles + row[rankTitle] = rank table = append(table, row) } @@ -266,50 +288,3 @@ func PrintNvmeHealthMap(hsm control.HostStorageMap, out io.Writer, opts ...Print return w.Err } - -// PrintNvmeMetaMap generates a human-readable representation of the supplied -// HostStorageMap, with a focus on presenting the NVMe Device Server Meta Data. -func PrintNvmeMetaMap(hsm control.HostStorageMap, out io.Writer, opts ...PrintConfigOption) error { - w := txtfmt.NewErrWriter(out) - - for _, key := range hsm.Keys() { - hss := hsm[key] - hosts := getPrintHosts(hss.HostSet.RangedString(), opts...) - lineBreak := strings.Repeat("-", len(hosts)) - fmt.Fprintf(out, "%s\n%s\n%s\n", lineBreak, hosts, lineBreak) - - if len(hss.HostStorage.NvmeDevices) == 0 { - fmt.Fprintln(out, " No NVMe devices detected") - continue - } - - for _, controller := range hss.HostStorage.NvmeDevices { - if controller == nil { - return errors.New("nil controller in NvmeDevices") - } - if err := printNvmeControllerSummary(controller, out, opts...); err != nil { - return err - } - iw := txtfmt.NewIndentWriter(out) - if len(controller.SmdDevices) > 0 { - fmt.Fprintln(iw, "SMD Devices") - - for _, device := range controller.SmdDevices { - iw1 := txtfmt.NewIndentWriter(iw) - - // Attach parent controller details to SMD before printing. 
- device.Ctrlr = *controller - - if err := printSmdDevice(device, iw1, opts...); err != nil { - return err - } - } - } else { - fmt.Fprintln(iw, "No SMD devices found") - } - fmt.Fprintln(out) - } - } - - return w.Err -} diff --git a/src/control/cmd/dmg/pretty/storage_nvme_test.go b/src/control/cmd/dmg/pretty/storage_nvme_test.go index c75b0061e142..08dc6b8426df 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme_test.go +++ b/src/control/cmd/dmg/pretty/storage_nvme_test.go @@ -17,10 +17,24 @@ import ( "github.com/google/go-cmp/cmp" "github.com/daos-stack/daos/src/control/lib/control" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/server/storage" ) func TestPretty_PrintNVMeController(t *testing.T) { + ctrlrWithSmd := func(idx int32, roleBits int) *storage.NvmeController { + c := storage.MockNvmeController(idx) + sd := storage.MockSmdDevice(nil, idx) + sd.Roles = storage.BdevRoles{storage.OptionBits(roleBits)} + sd.Rank = ranklist.Rank(idx) + c.SmdDevices = []*storage.SmdDevice{sd} + return c + } + ctrlrWithNilRank := func(idx int32) *storage.NvmeController { + c := ctrlrWithSmd(idx, 0) + c.SmdDevices[0].Rank = ranklist.NilRank + return c + } for name, tc := range map[string]struct { devices storage.NvmeControllers expPrintStr string @@ -31,10 +45,10 @@ func TestPretty_PrintNVMeController(t *testing.T) { storage.MockNvmeController(2), }, expPrintStr: ` -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB -0000:02:00.0 model-2 fwRev-2 0 2.0 TB +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA None +0000:02:00.0 model-2 fwRev-2 0 2.0 TB NA None `, }, "vmd backing devices": { @@ -43,10 +57,46 @@ NVMe PCI Model FW Revision Socket ID Capacity &storage.NvmeController{PciAddr: "050505:03:00.0"}, }, expPrintStr: ` -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -050505:01:00.0 0 0 B -050505:03:00.0 0 0 B +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +050505:01:00.0 0 0 B NA None +050505:03:00.0 0 0 B NA None +`, + }, + "controllers with roles": { + devices: storage.NvmeControllers{ + ctrlrWithSmd(1, 1), + ctrlrWithSmd(2, 6), + }, + expPrintStr: ` +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB data 1 +0000:02:00.0 model-2 fwRev-2 0 2.0 TB meta,wal 2 +`, + }, + "controllers with no roles": { + devices: storage.NvmeControllers{ + ctrlrWithSmd(1, 0), + ctrlrWithSmd(2, 0), + }, + expPrintStr: ` +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 1 +0000:02:00.0 model-2 fwRev-2 0 2.0 TB NA 2 +`, + }, + "controllers with no rank": { + devices: storage.NvmeControllers{ + ctrlrWithNilRank(1), + ctrlrWithNilRank(2), + }, + expPrintStr: ` +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA None +0000:02:00.0 model-2 fwRev-2 0 2.0 TB NA None `, }, } { @@ -327,172 +377,3 @@ PCI:%s Model:%s FW:%s Socket:%d Capacity:%s }) } } - -func TestPretty_PrintNVMetaMap(t *testing.T) { - mockNvmeController := func(idx int32) 
*storage.NvmeController { - c := storage.MockNvmeController(idx) - c.SmdDevices = []*storage.SmdDevice{ - storage.MockSmdDevice(nil, idx), - } - return c - } - var ( - controllerA = mockNvmeController(1) - controllerB = mockNvmeController(2) - controllerC = mockNvmeController(1) - controllerD = mockNvmeController(2) - controllerE = mockNvmeController(1) - controllerF = mockNvmeController(2) - ) - controllerA.SmdDevices = nil - controllerB.SmdDevices = nil - controllerE.SmdDevices = []*storage.SmdDevice{ - storage.MockSmdDevice(nil, 0), - storage.MockSmdDevice(nil, 1), - } - controllerF.SmdDevices = []*storage.SmdDevice{ - storage.MockSmdDevice(nil, 2), - storage.MockSmdDevice(nil, 3), - } - for name, tc := range map[string]struct { - hsm control.HostStorageMap - expPrintStr string - }{ - "no controllers": { - hsm: mockHostStorageMap(t, &mockHostStorage{"host1", &control.HostStorage{}}), - expPrintStr: ` ------ -host1 ------ - No NVMe devices detected -`, - }, - "no smd devices on controllers": { - hsm: mockHostStorageMap(t, - &mockHostStorage{ - "host1", - &control.HostStorage{ - NvmeDevices: storage.NvmeControllers{ - controllerA, controllerB, - }, - }, - }, - ), - expPrintStr: fmt.Sprintf(` ------ -host1 ------ -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - No SMD devices found - -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - No SMD devices found - -`, - controllerA.PciAddr, controllerA.Model, controllerA.FwRev, - controllerA.SocketID, humanize.Bytes(controllerA.Capacity()), - controllerB.PciAddr, controllerB.Model, controllerB.FwRev, - controllerB.SocketID, humanize.Bytes(controllerB.Capacity())), - }, - "single smd device on each controller": { - hsm: mockHostStorageMap(t, - &mockHostStorage{ - "host1", - &control.HostStorage{ - NvmeDevices: storage.NvmeControllers{ - controllerC, controllerD, - }, - }, - }, - ), - expPrintStr: fmt.Sprintf(` ------ -host1 ------ -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - SMD Devices - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - SMD Devices - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - -`, - controllerC.PciAddr, controllerC.Model, controllerC.FwRev, - controllerC.SocketID, humanize.Bytes(controllerC.Capacity()), - controllerC.SmdDevices[0].UUID, controllerC.PciAddr, - controllerC.SmdDevices[0].TargetIDs, - controllerC.SmdDevices[0].Rank, - controllerC.NvmeState, controllerC.LedState, - - controllerD.PciAddr, controllerD.Model, controllerD.FwRev, - controllerD.SocketID, humanize.Bytes(controllerD.Capacity()), - controllerD.SmdDevices[0].UUID, controllerD.PciAddr, - controllerD.SmdDevices[0].TargetIDs, - controllerD.SmdDevices[0].Rank, - controllerD.NvmeState, controllerD.LedState), - }, - "multiple smd devices on each controller": { - hsm: mockHostStorageMap(t, - &mockHostStorage{ - "host1", - &control.HostStorage{ - NvmeDevices: storage.NvmeControllers{ - controllerE, - controllerF, - }, - }, - }, - ), - expPrintStr: fmt.Sprintf(` ------ -host1 ------ -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - SMD Devices - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - -PCI:%s Model:%s FW:%s Socket:%d Capacity:%s - SMD Devices - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - UUID:%s [TrAddr:%s] - Roles:data,meta,wal Targets:%v Rank:%d State:%s LED:%s - -`, - controllerE.PciAddr, controllerE.Model, 
controllerE.FwRev, - controllerE.SocketID, humanize.Bytes(controllerE.Capacity()), - controllerE.SmdDevices[0].UUID, controllerE.PciAddr, - controllerE.SmdDevices[0].TargetIDs, controllerE.SmdDevices[0].Rank, - controllerE.NvmeState, controllerE.LedState, - controllerE.SmdDevices[1].UUID, controllerE.PciAddr, - controllerE.SmdDevices[1].TargetIDs, controllerE.SmdDevices[1].Rank, - controllerE.NvmeState, controllerE.LedState, - - controllerF.PciAddr, controllerF.Model, controllerF.FwRev, - controllerF.SocketID, humanize.Bytes(controllerF.Capacity()), - controllerF.SmdDevices[0].UUID, controllerF.PciAddr, - controllerF.SmdDevices[0].TargetIDs, controllerF.SmdDevices[0].Rank, - controllerF.NvmeState, controllerF.LedState, - controllerF.SmdDevices[1].UUID, controllerF.PciAddr, - controllerF.SmdDevices[1].TargetIDs, controllerF.SmdDevices[1].Rank, - controllerF.NvmeState, controllerF.LedState), - }, - } { - t.Run(name, func(t *testing.T) { - var bld strings.Builder - if err := PrintNvmeMetaMap(tc.hsm, &bld); err != nil { - t.Fatal(err) - } - - if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { - t.Fatalf("unexpected print output (-want, +got):\n%s\n", diff) - } - }) - } -} diff --git a/src/control/cmd/dmg/pretty/storage_scm.go b/src/control/cmd/dmg/pretty/storage_scm.go index 62123550d9f8..bfc2559421f7 100644 --- a/src/control/cmd/dmg/pretty/storage_scm.go +++ b/src/control/cmd/dmg/pretty/storage_scm.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2021 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -18,8 +18,9 @@ import ( ) func printScmMountPoints(mountpoints storage.ScmMountPoints, out io.Writer, opts ...PrintConfigOption) error { + iw := txtfmt.NewIndentWriter(out) if len(mountpoints) == 0 { - fmt.Fprintln(out, "\tNo SCM mount results") + fmt.Fprintln(iw, "No SCM mount results") return nil } @@ -48,16 +49,16 @@ func printScmMountPoints(mountpoints storage.ScmMountPoints, out io.Writer, opts // TODO: un-export function when not needed in cmd/daos_server/storage.go func PrintScmModules(modules storage.ScmModules, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) - + iw := txtfmt.NewIndentWriter(out) if len(modules) == 0 { - fmt.Fprintln(out, "\tNo SCM modules found") + fmt.Fprintln(iw, "No SCM modules found") return w.Err } - physicalIdTitle := "SCM Module ID" - socketTitle := "Socket ID" - memCtrlrTitle := "Memory Ctrlr ID" - channelTitle := "Channel ID" + physicalIdTitle := "SCM Module" + socketTitle := "Socket" + memCtrlrTitle := "Memory Ctrlr" + channelTitle := "Channel" slotTitle := "Channel Slot" capacityTitle := "Capacity" @@ -89,14 +90,14 @@ func PrintScmModules(modules storage.ScmModules, out io.Writer, opts ...PrintCon // TODO: un-export function when not needed in cmd/daos_server/storage.go func PrintScmNamespaces(namespaces storage.ScmNamespaces, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) - + iw := txtfmt.NewIndentWriter(out) if len(namespaces) == 0 { - fmt.Fprintln(out, "\tNo SCM namespaces found") + fmt.Fprintln(iw, "No SCM namespaces found") return w.Err } deviceTitle := "SCM Namespace" - socketTitle := "Socket ID" + socketTitle := "Socket" capacityTitle := "Capacity" formatter := txtfmt.NewTableFormatter(deviceTitle, socketTitle, capacityTitle) diff --git a/src/control/cmd/dmg/pretty/storage_test.go b/src/control/cmd/dmg/pretty/storage_test.go index 724031035ee3..22ddd4a17a18 100644 --- 
a/src/control/cmd/dmg/pretty/storage_test.go +++ b/src/control/cmd/dmg/pretty/storage_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -472,11 +472,12 @@ Errors: host1 ----- HugePage Size: 2048 KB - No SCM modules found -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB + No SCM modules found + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -501,11 +502,12 @@ Errors: host1 ----- HugePage Size: 2048 KB -SCM Module ID Socket ID Memory Ctrlr ID Channel ID Channel Slot Capacity -------------- --------- --------------- ---------- ------------ -------- -1 1 1 1 1 954 MiB - No NVMe devices found +SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity +---------- ------ ------------ ------- ------------ -------- +1 1 1 1 1 954 MiB + + No NVMe devices found `, }, @@ -535,9 +537,10 @@ Errors: host[1-2] --------- HugePage Size: 2048 KB - No SCM modules found - No NVMe devices found + No SCM modules found + + No NVMe devices found `, }, @@ -561,9 +564,10 @@ HugePage Size: 2048 KB host[1-2] --------- HugePage Size: 2048 KB - No SCM modules found - No NVMe devices found + No SCM modules found + + No NVMe devices found `, }, @@ -583,13 +587,14 @@ HugePage Size: 2048 KB host1 ----- HugePage Size: 2048 KB -SCM Module ID Socket ID Memory Ctrlr ID Channel ID Channel Slot Capacity -------------- --------- --------------- ---------- ------------ -------- -1 1 1 1 1 954 MiB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB +SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity +---------- ------ ------------ ------- ------------ -------- +1 1 1 1 1 954 MiB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -609,13 +614,14 @@ NVMe PCI Model FW Revision Socket ID Capacity host1 ----- HugePage Size: 2048 KB -SCM Namespace Socket ID Capacity -------------- --------- -------- -pmem0 0 1.0 TB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB +SCM Namespace Socket Capacity +------------- ------ -------- +pmem0 0 1.0 TB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -639,13 +645,14 @@ NVMe PCI Model FW Revision Socket ID Capacity host[1-2] --------- HugePage Size: 2048 KB -SCM Module ID Socket ID Memory Ctrlr ID Channel ID Channel Slot Capacity -------------- --------- --------------- ---------- ------------ -------- -1 1 1 1 1 954 MiB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB +SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity +---------- ------ ------------ ------- ------------ -------- +1 1 1 1 1 954 MiB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -669,21 +676,23 @@ NVMe PCI Model FW Revision Socket ID Capacity host1 ----- HugePage Size: 2048 KB -SCM Module 
ID Socket ID Memory Ctrlr ID Channel ID Channel Slot Capacity -------------- --------- --------------- ---------- ------------ -------- -1 1 1 1 1 954 MiB - No NVMe devices found +SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity +---------- ------ ------------ ------- ------------ -------- +1 1 1 1 1 954 MiB + + No NVMe devices found ----- host2 ----- HugePage Size: 2048 KB - No SCM modules found -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB + No SCM modules found + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -699,13 +708,14 @@ NVMe PCI Model FW Revision Socket ID Capacity host[0-1023] ------------ HugePage Size: 2048 KB -SCM Module ID Socket ID Memory Ctrlr ID Channel ID Channel Slot Capacity -------------- --------- --------------- ---------- ------------ -------- -1 1 1 1 1 954 MiB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB +SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity +---------- ------ ------------ ------- ------------ -------- +1 1 1 1 1 954 MiB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -737,11 +747,12 @@ NVMe PCI Model FW Revision Socket ID Capacity host-[0001-0004] ---------------- HugePage Size: 2048 KB - No SCM modules found -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB + No SCM modules found + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -773,11 +784,12 @@ NVMe PCI Model FW Revision Socket ID Capacity host-j-[0001-0004] ------------------ HugePage Size: 2048 KB - No SCM modules found -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 model-1 fwRev-1 1 2.0 TB + No SCM modules found + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 model-1 fwRev-1 1 2.0 TB NA 0 `, }, @@ -809,29 +821,31 @@ NVMe PCI Model FW Revision Socket ID Capacity host[1,3] --------- HugePage Size: 2048 KB -SCM Namespace Socket ID Capacity -------------- --------- -------- -pmem0 0 1.0 TB -pmem1 1 2.0 TB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 1 2.0 TB -0000:04:00.0 0 2.0 TB +SCM Namespace Socket Capacity +------------- ------ -------- +pmem0 0 1.0 TB +pmem1 1 2.0 TB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- +0000:01:00.0 1 2.0 TB data,meta,wal 0 +0000:04:00.0 0 2.0 TB data,meta,wal 0 --------- host[2,4] --------- HugePage Size: 2048 KB -SCM Namespace Socket ID Capacity -------------- --------- -------- -pmem0 0 1.0 TB -pmem1 1 2.0 TB -NVMe PCI Model FW Revision Socket ID Capacity --------- ----- ----------- --------- -------- -0000:01:00.0 1 2.1 TB -0000:04:00.0 0 2.1 TB +SCM Namespace Socket Capacity +------------- ------ -------- +pmem0 0 1.0 TB +pmem1 1 2.0 TB + +NVMe PCI Model FW Revision Socket Capacity Role(s) Rank +-------- ----- ----------- ------ -------- ------- ---- 
+0000:01:00.0 1 2.1 TB data,meta,wal 0 +0000:04:00.0 0 2.1 TB data,meta,wal 0 `, }, @@ -1058,6 +1072,41 @@ Format Summary: Hosts SCM Devices NVMe Devices ----- ----------- ------------ host1 2 2 +`, + }, + "1 SCM, NVMe skipped": { + resp: &control.StorageFormatResp{ + HostErrorsResp: control.HostErrorsResp{ + HostErrors: make(control.HostErrorsMap), + }, + HostStorage: func() control.HostStorageMap { + hsm := make(control.HostStorageMap) + hs := &control.HostStorage{ + ScmMountPoints: []*storage.ScmMountPoint{ + { + Info: "success", + Path: "/mnt/0", + }, + }, + NvmeDevices: []*storage.NvmeController{ + { + Info: "skipping", + PciAddr: storage.NilBdevAddress, + }, + }, + } + if err := hsm.Add("host1", hs); err != nil { + t.Fatal(err) + } + return hsm + }(), + }, + expPrintStr: ` + +Format Summary: + Hosts SCM Devices NVMe Devices + ----- ----------- ------------ + host1 1 0 `, }, "2 Hosts, 2 SCM, 2 NVMe; first SCM fails": { @@ -1151,9 +1200,9 @@ SCM Mount Format Result --------- ------------- /mnt/2 CTL_SUCCESS -NVMe PCI Format Result --------- ------------- -2 CTL_SUCCESS +NVMe PCI Format Result Role(s) +-------- ------------- ------- +2 CTL_SUCCESS NA `, }, @@ -1178,9 +1227,9 @@ SCM Mount Format Result /mnt/1 CTL_SUCCESS /mnt/2 CTL_SUCCESS -NVMe PCI Format Result --------- ------------- -1 CTL_SUCCESS +NVMe PCI Format Result Role(s) +-------- ------------- ------- +1 CTL_SUCCESS NA `, }, @@ -1200,10 +1249,10 @@ SCM Mount Format Result /mnt/1 CTL_SUCCESS /mnt/2 CTL_SUCCESS -NVMe PCI Format Result --------- ------------- -1 CTL_SUCCESS -2 CTL_SUCCESS +NVMe PCI Format Result Role(s) +-------- ------------- ------- +1 CTL_SUCCESS NA +2 CTL_SUCCESS NA `, }, @@ -1227,9 +1276,9 @@ SCM Mount Format Result --------- ------------- /mnt/2 CTL_SUCCESS -NVMe PCI Format Result --------- ------------- -2 CTL_SUCCESS +NVMe PCI Format Result Role(s) +-------- ------------- ------- +2 CTL_SUCCESS NA `, }, @@ -1249,10 +1298,74 @@ SCM Mount Format Result /mnt/1 CTL_SUCCESS /mnt/2 CTL_SUCCESS -NVMe PCI Format Result --------- ------------- -1 CTL_SUCCESS -2 CTL_SUCCESS +NVMe PCI Format Result Role(s) +-------- ------------- ------- +1 CTL_SUCCESS NA +2 CTL_SUCCESS NA + +`, + }, + "2 Hosts, 2 SCM, 2 NVMe; MD-on-SSD roles": { + resp: control.MockFormatResp(t, control.MockFormatConf{ + Hosts: 2, + ScmPerHost: 2, + NvmePerHost: 2, + NvmeRoleBits: int(storage.BdevRoleAll), + }), + expPrintStr: ` + +--------- +host[1-2] +--------- +SCM Mount Format Result +--------- ------------- +/mnt/1 CTL_SUCCESS +/mnt/2 CTL_SUCCESS + +NVMe PCI Format Result Role(s) +-------- ------------- ------- +1 CTL_SUCCESS data,meta,wal +2 CTL_SUCCESS data,meta,wal + +`, + }, + "1 SCM, NVMe skipped": { + resp: &control.StorageFormatResp{ + HostErrorsResp: control.HostErrorsResp{ + HostErrors: make(control.HostErrorsMap), + }, + HostStorage: func() control.HostStorageMap { + hsm := make(control.HostStorageMap) + hs := &control.HostStorage{ + ScmMountPoints: []*storage.ScmMountPoint{ + { + Info: "CTL_SUCCESS", + Path: "/mnt/0", + }, + }, + NvmeDevices: []*storage.NvmeController{ + { + Info: "skipping", + PciAddr: storage.NilBdevAddress, + }, + }, + } + if err := hsm.Add("host1", hs); err != nil { + t.Fatal(err) + } + return hsm + }(), + }, + expPrintStr: ` + +----- +host1 +----- +SCM Mount Format Result +--------- ------------- +/mnt/0 CTL_SUCCESS + + No NVMe devices were formatted `, }, @@ -1393,7 +1506,7 @@ host1 ----- host1 ----- - No pools found + No pools with NVMe found `, }, "list-devices": { @@ -1442,13 +1555,13 @@ host1 host1 ----- 
Devices - UUID:00000000-0000-0000-0000-000000000000 [TrAddr:0000:8a:00.0] + UUID:00000000-0000-0000-0000-000000000000 [TrAddr:0000:8a:00.0 NSID:0] Roles:wal SysXS Targets:[0 1 2] Rank:0 State:NEW LED:OFF - UUID:00000001-0001-0001-0001-000000000001 [TrAddr:0000:8b:00.0] + UUID:00000001-0001-0001-0001-000000000001 [TrAddr:0000:8b:00.0 NSID:0] Roles:data,meta Targets:[3 4 5] Rank:0 State:EVICTED LED:ON - UUID:00000002-0002-0002-0002-000000000002 [TrAddr:0000:da:00.0] + UUID:00000002-0002-0002-0002-000000000002 [TrAddr:0000:da:00.0 NSID:0] Roles:wal SysXS Targets:[0 1 2] Rank:1 State:UNKNOWN LED:NA - UUID:00000003-0003-0003-0003-000000000003 [TrAddr:0000:db:00.0] + UUID:00000003-0003-0003-0003-000000000003 [TrAddr:0000:db:00.0 NSID:0] Roles:data,meta Targets:[3 4 5] Rank:1 State:NORMAL LED:QUICK_BLINK `, }, @@ -1478,11 +1591,12 @@ host1 SmdInfo: &control.SmdInfo{ Devices: []*storage.SmdDevice{ { - UUID: test.MockUUID(0), - TargetIDs: []int32{0, 1, 2}, - Rank: 0, - Ctrlr: *mockController, - Roles: storage.BdevRoles{storage.BdevRoleAll}, + UUID: test.MockUUID(0), + TargetIDs: []int32{0, 1, 2}, + Rank: 0, + Ctrlr: *mockController, + CtrlrNamespaceID: 1, + Roles: storage.BdevRoles{storage.BdevRoleAll}, }, }, }, @@ -1494,7 +1608,7 @@ host1 host1 ----- Devices - UUID:00000000-0000-0000-0000-000000000000 [TrAddr:0000:01:00.0] + UUID:00000000-0000-0000-0000-000000000000 [TrAddr:0000:01:00.0 NSID:1] Roles:data,meta,wal Targets:[0 1 2] Rank:0 State:NORMAL LED:OFF Health Stats: Temperature:%dK(%.02fC) @@ -1579,7 +1693,7 @@ host1 host1 ----- Devices - TrAddr:0000:db:00.0 [UUID:842c739b-86b5-462f-a7ba-b4a91b674f3d] LED:QUICK_BLINK + TrAddr:0000:db:00.0 NSID:0 [UUID:842c739b-86b5-462f-a7ba-b4a91b674f3d] LED:QUICK_BLINK `, }, "identify led; no uuid specified": { @@ -1592,7 +1706,8 @@ host1 SmdInfo: &control.SmdInfo{ Devices: []*storage.SmdDevice{ { - Ctrlr: identCtrlr, + Ctrlr: identCtrlr, + CtrlrNamespaceID: 1, }, }, }, @@ -1604,7 +1719,7 @@ host1 host1 ----- Devices - TrAddr:0000:db:00.0 LED:QUICK_BLINK + TrAddr:0000:db:00.0 NSID:1 LED:QUICK_BLINK `, }, } { diff --git a/src/control/cmd/dmg/server.go b/src/control/cmd/dmg/server.go index aa4a6e91f220..7f61b6e165ba 100644 --- a/src/control/cmd/dmg/server.go +++ b/src/control/cmd/dmg/server.go @@ -7,7 +7,6 @@ package main import ( - "context" "strings" "github.com/pkg/errors" @@ -49,7 +48,7 @@ func (cmd *serverSetLogMasksCmd) Execute(_ []string) (errOut error) { cmd.Debugf("set log masks request: %+v", req) - resp, err := control.SetEngineLogMasks(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SetEngineLogMasks(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } diff --git a/src/control/cmd/dmg/storage.go b/src/control/cmd/dmg/storage.go index fea3160f74ca..447a3ec46ec5 100644 --- a/src/control/cmd/dmg/storage.go +++ b/src/control/cmd/dmg/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2019-2023 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,7 +7,6 @@ package main import ( - "context" "strings" "github.com/pkg/errors" @@ -37,31 +36,26 @@ type storageScanCmd struct { cmdutil.JSONOutputCmd Verbose bool `short:"v" long:"verbose" description:"List SCM & NVMe device details"` NvmeHealth bool `short:"n" long:"nvme-health" description:"Display NVMe device health statistics"` - NvmeMeta bool `short:"m" long:"nvme-meta" description:"Display server meta data held on NVMe storage"` } // Execute is run when storageScanCmd activates. // // Runs NVMe and SCM storage scan on all connected servers. func (cmd *storageScanCmd) Execute(_ []string) error { - if cmd.NvmeHealth && cmd.NvmeMeta { - return errors.New("cannot use --nvme-health and --nvme-meta together") - } - if cmd.Verbose && (cmd.NvmeHealth || cmd.NvmeMeta) { - return errors.New("cannot use --verbose with --nvme-health or --nvme-meta") + if cmd.Verbose && cmd.NvmeHealth { + return errors.New("cannot use --verbose with --nvme-health") } req := &control.StorageScanReq{ NvmeHealth: cmd.NvmeHealth, - NvmeMeta: cmd.NvmeMeta, - // don't strip nvme details if verbose or health or meta set - NvmeBasic: !(cmd.Verbose || cmd.NvmeHealth || cmd.NvmeMeta), + // Strip nvme details if verbose and health flags are unset. + NvmeBasic: !(cmd.Verbose || cmd.NvmeHealth), } req.SetHostList(cmd.getHostList()) cmd.Debugf("storage scan request: %+v", req) - resp, err := control.StorageScan(context.Background(), cmd.ctlInvoker, req) + resp, err := control.StorageScan(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err } @@ -81,16 +75,11 @@ func (cmd *storageScanCmd) Execute(_ []string) error { } var out strings.Builder - switch { - case cmd.NvmeHealth: + if cmd.NvmeHealth { if err := pretty.PrintNvmeHealthMap(resp.HostStorage, &out); err != nil { return err } - case cmd.NvmeMeta: - if err := pretty.PrintNvmeMetaMap(resp.HostStorage, &out); err != nil { - return err - } - default: + } else { verbose := pretty.PrintWithVerboseOutput(cmd.Verbose) if err := pretty.PrintHostStorageMap(resp.HostStorage, &out, verbose); err != nil { return err @@ -115,7 +104,7 @@ type storageFormatCmd struct { // // Run NVMe and SCM storage format on all connected servers. func (cmd *storageFormatCmd) Execute(args []string) (err error) { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.StorageFormatReq{Reformat: cmd.Force} req.SetHostList(cmd.getHostList()) @@ -164,7 +153,7 @@ type nvmeRebindCmd struct { // // Rebind NVMe SSD from kernel driver and bind to user-space driver on single server. func (cmd *nvmeRebindCmd) Execute(args []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() if len(cmd.getHostList()) != 1 { return errors.New("command expects a single host in hostlist") @@ -213,7 +202,7 @@ type nvmeAddDeviceCmd struct { // // Add recently inserted NVMe SSD to a running engine by updating relevant NVMe config file. 
func (cmd *nvmeAddDeviceCmd) Execute(args []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() if len(cmd.getHostList()) != 1 { return errors.New("command expects a single host in hostlist") diff --git a/src/control/cmd/dmg/storage_query.go b/src/control/cmd/dmg/storage_query.go index b02475a67f89..c05a2437e87b 100644 --- a/src/control/cmd/dmg/storage_query.go +++ b/src/control/cmd/dmg/storage_query.go @@ -64,7 +64,7 @@ func (cmd *smdQueryCmd) makeRequest(ctx context.Context, req *control.SmdQueryRe // storageQueryCmd is the struct representing the storage query subcommand type storageQueryCmd struct { DeviceHealth devHealthQueryCmd `command:"device-health" description:"Query the device health"` - ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools on the server"` + ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools with NVMe on the server"` ListDevices listDevicesQueryCmd `command:"list-devices" description:"List storage devices on the server"` Usage usageQueryCmd `command:"usage" description:"Show SCM & NVMe storage space utilization per storage server"` } @@ -75,7 +75,7 @@ type devHealthQueryCmd struct { } func (cmd *devHealthQueryCmd) Execute(_ []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.SmdQueryReq{ OmitPools: true, IncludeBioHealth: true, @@ -94,7 +94,7 @@ type listDevicesQueryCmd struct { } func (cmd *listDevicesQueryCmd) Execute(_ []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.SmdQueryReq{ OmitPools: true, @@ -114,7 +114,7 @@ type listPoolsQueryCmd struct { } func (cmd *listPoolsQueryCmd) Execute(_ []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.SmdQueryReq{ OmitDevices: true, Rank: cmd.GetRank(), @@ -135,7 +135,7 @@ type usageQueryCmd struct { // // Queries NVMe and SCM usage on hosts. 
func (cmd *usageQueryCmd) Execute(_ []string) error { - ctx := context.Background() + ctx := cmd.MustLogCtx() req := &control.StorageScanReq{Usage: true} req.SetHostList(cmd.getHostList()) resp, err := control.StorageScan(ctx, cmd.ctlInvoker, req) @@ -217,7 +217,7 @@ func (cmd *nvmeSetFaultyCmd) Execute(_ []string) error { Operation: control.SetFaultyOp, IDs: cmd.UUID, } - return cmd.makeRequest(context.Background(), req) + return cmd.makeRequest(cmd.MustLogCtx(), req) } // storageReplaceCmd is the struct representing the replace storage subcommand @@ -251,7 +251,7 @@ func (cmd *nvmeReplaceCmd) Execute(_ []string) error { ReplaceUUID: cmd.NewDevUUID, ReplaceNoReint: cmd.NoReint, } - return cmd.makeRequest(context.Background(), req) + return cmd.makeRequest(cmd.MustLogCtx(), req) } type ledCmd struct { @@ -291,7 +291,7 @@ func (cmd *ledIdentifyCmd) Execute(_ []string) error { } req.Operation = control.LedResetOp } - return cmd.makeRequest(context.Background(), req, pretty.PrintOnlyLEDInfo()) + return cmd.makeRequest(cmd.MustLogCtx(), req, pretty.PrintOnlyLEDInfo()) } type ledCheckCmd struct { @@ -309,5 +309,5 @@ func (cmd *ledCheckCmd) Execute(_ []string) error { Operation: control.LedCheckOp, IDs: cmd.Args.IDs, } - return cmd.makeRequest(context.Background(), req, pretty.PrintOnlyLEDInfo()) + return cmd.makeRequest(cmd.MustLogCtx(), req, pretty.PrintOnlyLEDInfo()) } diff --git a/src/control/cmd/dmg/storage_test.go b/src/control/cmd/dmg/storage_test.go index 300747fa285d..124b46e984a2 100644 --- a/src/control/cmd/dmg/storage_test.go +++ b/src/control/cmd/dmg/storage_test.go @@ -88,30 +88,6 @@ func TestStorageCommands(t *testing.T) { "", errors.New("cannot use --verbose"), }, - { - "Scan NVMe meta data short", - "storage scan -m", - printRequest(t, &control.StorageScanReq{NvmeMeta: true}), - nil, - }, - { - "Scan NVMe meta data long", - "storage scan --nvme-meta", - printRequest(t, &control.StorageScanReq{NvmeMeta: true}), - nil, - }, - { - "Scan NVMe meta with verbose", - "storage scan --nvme-meta --verbose", - "", - errors.New("cannot use --verbose"), - }, - { - "Scan NVMe meta and health", - "storage scan --nvme-meta --nvme-health --verbose", - "", - errors.New("cannot use --nvme-health and --nvme-meta"), - }, { "Rebind NVMe; no PCI address", "storage nvme-rebind", diff --git a/src/control/cmd/dmg/support.go b/src/control/cmd/dmg/support.go index c18aa28966a1..c312c9036351 100644 --- a/src/control/cmd/dmg/support.go +++ b/src/control/cmd/dmg/support.go @@ -7,7 +7,6 @@ package main import ( - "context" "fmt" "os" "path/filepath" @@ -49,7 +48,7 @@ func (cmd *collectLogCmd) rsyncLog() error { LogFunction: support.RsyncLogEnum, } cmd.Debugf("Rsync logs from servers to %s:%s ", hostName, cmd.TargetFolder) - resp, err := control.CollectLog(context.Background(), cmd.ctlInvoker, req) + resp, err := control.CollectLog(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil && cmd.Stop { return err } @@ -76,7 +75,7 @@ func (cmd *collectLogCmd) archLogsOnServer() error { LogFunction: support.ArchiveLogsEnum, } cmd.Debugf("Archiving the Log Folder %s", cmd.TargetFolder) - resp, err := control.CollectLog(context.Background(), cmd.ctlInvoker, req) + resp, err := control.CollectLog(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil && cmd.Stop { return err } @@ -152,7 +151,7 @@ func (cmd *collectLogCmd) Execute(_ []string) error { for logFunc, logCmdSet := range LogCollection { for _, logCmd := range logCmdSet { cmd.Debugf("Log Function %d -- Log Collect Cmd %s ", logFunc, logCmd) - ctx := 
context.Background() + ctx := cmd.MustLogCtx() req := &control.CollectLogReq{ TargetFolder: cmd.TargetFolder, ExtraLogsDir: cmd.ExtraLogsDir, diff --git a/src/control/cmd/dmg/system.go b/src/control/cmd/dmg/system.go index 87a9e46c75e2..376b1c4e5f82 100644 --- a/src/control/cmd/dmg/system.go +++ b/src/control/cmd/dmg/system.go @@ -7,7 +7,6 @@ package main import ( - "context" "fmt" "io" "strings" @@ -59,7 +58,7 @@ func (cmd *leaderQueryCmd) Execute(_ []string) (errOut error) { return errors.New("no configuration loaded") } - ctx := context.Background() + ctx := cmd.MustLogCtx() req := new(control.LeaderQueryReq) resp, err := control.LeaderQuery(ctx, cmd.ctlInvoker, req) @@ -129,7 +128,7 @@ func (cmd *systemQueryCmd) Execute(_ []string) (errOut error) { req.NotOK = cmd.NotOK req.WantedStates = cmd.WantedStates.States - resp, err := control.SystemQuery(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemQuery(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } @@ -157,7 +156,7 @@ type systemEraseCmd struct { } func (cmd *systemEraseCmd) Execute(_ []string) error { - resp, err := control.SystemErase(context.Background(), cmd.ctlInvoker, new(control.SystemEraseReq)) + resp, err := control.SystemErase(cmd.MustLogCtx(), cmd.ctlInvoker, new(control.SystemEraseReq)) if err != nil { return err } @@ -190,7 +189,7 @@ func (cmd *systemStopCmd) Execute(_ []string) (errOut error) { req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) - resp, err := control.SystemStop(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemStop(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } @@ -231,7 +230,7 @@ func (cmd *baseExcludeCmd) execute(clear bool) error { req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) - resp, err := control.SystemExclude(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemExclude(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } @@ -291,7 +290,7 @@ func (cmd *systemStartCmd) Execute(_ []string) (errOut error) { req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) - resp, err := control.SystemStart(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemStart(cmd.MustLogCtx(), cmd.ctlInvoker, req) if err != nil { return err // control api returned an error, disregard response } @@ -330,7 +329,7 @@ func (cmd *systemCleanupCmd) Execute(_ []string) (errOut error) { errOut = errors.Wrap(errOut, "system cleanup failed") }() - ctx := context.Background() + ctx := cmd.MustLogCtx() req := new(control.SystemCleanupReq) req.SetSystem(cmd.config.SystemName) req.Machine = cmd.Args.Machine @@ -377,7 +376,7 @@ func (cmd *systemSetAttrCmd) Execute(_ []string) error { Attributes: cmd.Args.Attrs.ParsedProps, } - err := control.SystemSetAttr(context.Background(), cmd.ctlInvoker, req) + err := control.SystemSetAttr(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(nil, err) } @@ -429,7 +428,7 @@ func (cmd *systemGetAttrCmd) Execute(_ []string) error { Keys: cmd.Args.Attrs.ParsedProps.ToSlice(), } - resp, err := control.SystemGetAttr(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemGetAttr(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) } @@ -466,7 
+465,7 @@ func (cmd *systemDelAttrCmd) Execute(_ []string) error { req.Attributes[key] = "" } - err := control.SystemSetAttr(context.Background(), cmd.ctlInvoker, req) + err := control.SystemSetAttr(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(nil, err) } @@ -542,7 +541,7 @@ func (cmd *systemSetPropCmd) Execute(_ []string) error { Properties: cmd.Args.Props.ParsedProps, } - err := control.SystemSetProp(context.Background(), cmd.ctlInvoker, req) + err := control.SystemSetProp(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(nil, err) } @@ -635,7 +634,7 @@ func (cmd *systemGetPropCmd) Execute(_ []string) error { Keys: cmd.Args.Props.ParsedProps, } - resp, err := control.SystemGetProp(context.Background(), cmd.ctlInvoker, req) + resp, err := control.SystemGetProp(cmd.MustLogCtx(), cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp.Properties, err) } diff --git a/src/control/cmd/dmg/telemetry.go b/src/control/cmd/dmg/telemetry.go index 9351496addec..fd63164b4386 100644 --- a/src/control/cmd/dmg/telemetry.go +++ b/src/control/cmd/dmg/telemetry.go @@ -9,7 +9,6 @@ package main import ( "archive/tar" "compress/gzip" - "context" "encoding/json" "fmt" "io" @@ -324,7 +323,7 @@ func (cmd *metricsListCmd) Execute(args []string) error { cmd.Info(getConnectingMsg(req.Host, req.Port)) } - resp, err := control.MetricsList(context.Background(), req) + resp, err := control.MetricsList(cmd.MustLogCtx(), req) if err != nil { return err } @@ -380,7 +379,7 @@ func (cmd *metricsQueryCmd) Execute(args []string) error { cmd.Info(getConnectingMsg(req.Host, req.Port)) } - resp, err := control.MetricsQuery(context.Background(), req) + resp, err := control.MetricsQuery(cmd.MustLogCtx(), req) if err != nil { return err } diff --git a/src/control/common/cmdutil/logging.go b/src/control/common/cmdutil/logging.go index 8b814eb21638..ab436fed5290 100644 --- a/src/control/common/cmdutil/logging.go +++ b/src/control/common/cmdutil/logging.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2023 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,6 +7,12 @@ package cmdutil import ( + "context" + "os" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/logging" ) @@ -24,9 +30,97 @@ type ( LogCmd struct { logging.Logger } + + // LogConfig contains parameters used to configure the logger. + LogConfig struct { + LogFile string + LogLevel common.ControlLogLevel + JSON bool + } ) // SetLog sets the logger for the command. func (cmd *LogCmd) SetLog(log logging.Logger) { cmd.Logger = log } + +// LogCtx returns a context with the command's logger set. +func (cmd *LogCmd) LogCtx() (context.Context, error) { + return logging.ToContext(context.Background(), cmd.Logger) +} + +// MustLogCtx returns a context with the command's logger set. +// NB: Panics on error. +func (cmd *LogCmd) MustLogCtx() context.Context { + ctx, err := cmd.LogCtx() + if err != nil { + panic(err) + } + return ctx +} + +// ConfigureLogger configures the logger according to the requested config. +func ConfigureLogger(logIn logging.Logger, cfg LogConfig) error { + log, ok := logIn.(*logging.LeveledLogger) + if !ok { + return errors.New("logger is not a LeveledLogger") + } + + // Set log level mask for default logger from config, + // unless it was explicitly set to debug via CLI flag. 
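+ // NB: Levels without a case below (e.g. the INFO default) leave the
+ // current mask untouched, preserving a debug level already set via CLI.
+ // When verbosity is being reduced (NOTICE/ERROR) the debug message is
+ // emitted before the mask changes so that it is still visible.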
+ applyLogConfig := func() error { + switch logging.LogLevel(cfg.LogLevel) { + case logging.LogLevelTrace: + log.SetLevel(logging.LogLevelTrace) + log.Debugf("Switching control log level to TRACE") + case logging.LogLevelDebug: + log.SetLevel(logging.LogLevelDebug) + log.Debugf("Switching control log level to DEBUG") + case logging.LogLevelNotice: + log.Debugf("Switching control log level to NOTICE") + log.SetLevel(logging.LogLevelNotice) + case logging.LogLevelError: + log.Debugf("Switching control log level to ERROR") + log.SetLevel(logging.LogLevelError) + } + + if cfg.JSON { + log = log.WithJSONOutput() + } + + log.Debugf("configured logging: level=%s, file=%s, json=%v", + cfg.LogLevel, cfg.LogFile, cfg.JSON) + + return nil + } + + hostname, err := os.Hostname() + if err != nil { + return errors.Wrap(err, "getting hostname") + } + + // Set log file for default logger if specified in config. + if cfg.LogFile != "" { + f, err := common.AppendFile(cfg.LogFile) + if err != nil { + return errors.Wrap(err, "create log file") + } + + log.Infof("%s logging to file %s", os.Args[0], cfg.LogFile) + + // Create an additional set of loggers which append everything + // to the specified file. + log = log. + WithErrorLogger(logging.NewErrorLogger(hostname, f)). + WithNoticeLogger(logging.NewNoticeLogger(hostname, f)). + WithInfoLogger(logging.NewInfoLogger(hostname, f)). + WithDebugLogger(logging.NewDebugLogger(f)). + WithTraceLogger(logging.NewTraceLogger(f)) + + return applyLogConfig() + } + + log.Info("no control log file specified; logging to stdout") + + return applyLogConfig() +} diff --git a/src/control/common/proto/ctl/addons.go b/src/control/common/proto/ctl/addons.go index bab491341d74..965b8f3226da 100644 --- a/src/control/common/proto/ctl/addons.go +++ b/src/control/common/proto/ctl/addons.go @@ -51,3 +51,14 @@ func (vls *LedState) UnmarshalJSON(data []byte) error { return nil } + +// IsScannable returns true if NVMe device state indicates controller details are accessible. +func (nc *NvmeController) IsScannable() bool { + return nc.DevState == NvmeDevState_NORMAL || nc.DevState == NvmeDevState_EVICTED || + nc.DevState == NvmeDevState_NEW +} + +// CanSupplyHealthStats returns true if NVMe device state indicates health stats are accessible. +func (nc *NvmeController) CanSupplyHealthStats() bool { + return nc.DevState == NvmeDevState_NORMAL || nc.DevState == NvmeDevState_EVICTED +} diff --git a/src/control/common/proto/ctl/smd.pb.go b/src/control/common/proto/ctl/smd.pb.go index 176c1294735e..2fd9edf0aede 100644 --- a/src/control/common/proto/ctl/smd.pb.go +++ b/src/control/common/proto/ctl/smd.pb.go @@ -84,28 +84,28 @@ func (NvmeDevState) EnumDescriptor() ([]byte, []int) { type LedState int32 const ( - LedState_OFF LedState = 0 // Equivalent to SPDK_VMD_LED_STATE_OFF + LedState_NA LedState = 0 // Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) LedState_QUICK_BLINK LedState = 1 // Equivalent to SPDK_VMD_LED_STATE_IDENTIFY (4Hz blink) LedState_ON LedState = 2 // Equivalent to SPDK_VMD_LED_STATE_FAULT (solid on) LedState_SLOW_BLINK LedState = 3 // Equivalent to SPDK_VMD_LED_STATE_REBUILD (1Hz blink) - LedState_NA LedState = 4 // Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) + LedState_OFF LedState = 4 // Equivalent to SPDK_VMD_LED_STATE_OFF ) // Enum value maps for LedState. 
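// NB: this reordering makes NA the zero value of LedState, so an unset or
// defaulted led_state field now decodes as NA (VMD not enabled) rather than OFF.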
var ( LedState_name = map[int32]string{ - 0: "OFF", + 0: "NA", 1: "QUICK_BLINK", 2: "ON", 3: "SLOW_BLINK", - 4: "NA", + 4: "OFF", } LedState_value = map[string]int32{ - "OFF": 0, + "NA": 0, "QUICK_BLINK": 1, "ON": 2, "SLOW_BLINK": 3, - "NA": 4, + "OFF": 4, } ) @@ -773,7 +773,7 @@ func (x *NvmeController) GetLedState() LedState { if x != nil { return x.LedState } - return LedState_OFF + return LedState_NA } func (x *NvmeController) GetPciDevType() string { @@ -1324,7 +1324,7 @@ func (x *LedManageReq) GetLedState() LedState { if x != nil { return x.LedState } - return LedState_OFF + return LedState_NA } func (x *LedManageReq) GetLedDurationMins() uint32 { @@ -2296,10 +2296,10 @@ var file_ctl_smd_proto_rawDesc = []byte{ 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x45, 0x56, 0x49, 0x43, 0x54, 0x45, 0x44, 0x10, 0x03, 0x12, 0x0d, 0x0a, 0x09, 0x55, 0x4e, 0x50, 0x4c, 0x55, 0x47, 0x47, 0x45, 0x44, 0x10, 0x04, 0x2a, 0x44, 0x0a, 0x08, 0x4c, 0x65, - 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x4f, 0x46, 0x46, 0x10, 0x00, 0x12, - 0x0f, 0x0a, 0x0b, 0x51, 0x55, 0x49, 0x43, 0x4b, 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x01, - 0x12, 0x06, 0x0a, 0x02, 0x4f, 0x4e, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x53, 0x4c, 0x4f, 0x57, - 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x4e, 0x41, 0x10, 0x04, + 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x06, 0x0a, 0x02, 0x4e, 0x41, 0x10, 0x00, 0x12, 0x0f, + 0x0a, 0x0b, 0x51, 0x55, 0x49, 0x43, 0x4b, 0x5f, 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x01, 0x12, + 0x06, 0x0a, 0x02, 0x4f, 0x4e, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x53, 0x4c, 0x4f, 0x57, 0x5f, + 0x42, 0x4c, 0x49, 0x4e, 0x4b, 0x10, 0x03, 0x12, 0x07, 0x0a, 0x03, 0x4f, 0x46, 0x46, 0x10, 0x04, 0x2a, 0x28, 0x0a, 0x09, 0x4c, 0x65, 0x64, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x07, 0x0a, 0x03, 0x47, 0x45, 0x54, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x45, 0x54, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x45, 0x53, 0x45, 0x54, 0x10, 0x02, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index 071ba10e04a9..62fede43ed4c 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -32,8 +32,9 @@ type NvmeControllerResult struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - PciAddr string `protobuf:"bytes,1,opt,name=pci_addr,json=pciAddr,proto3" json:"pci_addr,omitempty"` // PCI address of NVMe controller - State *ResponseState `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` // state of current operation + PciAddr string `protobuf:"bytes,1,opt,name=pci_addr,json=pciAddr,proto3" json:"pci_addr,omitempty"` // PCI address of NVMe controller + State *ResponseState `protobuf:"bytes,2,opt,name=state,proto3" json:"state,omitempty"` // state of current operation + RoleBits uint32 `protobuf:"varint,3,opt,name=role_bits,json=roleBits,proto3" json:"role_bits,omitempty"` // Device active roles (bitmask) } func (x *NvmeControllerResult) Reset() { @@ -82,6 +83,13 @@ func (x *NvmeControllerResult) GetState() *ResponseState { return nil } +func (x *NvmeControllerResult) GetRoleBits() uint32 { + if x != nil { + return x.RoleBits + } + return 0 +} + type ScanNvmeReq struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -260,33 +268,35 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x0a, 0x16, 0x63, 0x74, 0x6c, 0x2f, 
0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x6e, 0x76, 0x6d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x03, 0x63, 0x74, 0x6c, 0x1a, 0x10, 0x63, 0x74, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, - 0x0d, 0x63, 0x74, 0x6c, 0x2f, 0x73, 0x6d, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x5b, + 0x0d, 0x63, 0x74, 0x6c, 0x2f, 0x73, 0x6d, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x78, 0x0a, 0x14, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x19, 0x0a, 0x08, 0x70, 0x63, 0x69, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x70, 0x63, 0x69, 0x41, 0x64, 0x64, 0x72, 0x12, 0x28, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x85, 0x01, 0x0a, 0x0b, - 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, - 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, - 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x08, 0x52, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x12, 0x14, 0x0a, 0x05, 0x42, 0x61, 0x73, 0x69, 0x63, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x42, 0x61, 0x73, 0x69, 0x63, 0x12, 0x1a, 0x0a, - 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, - 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, - 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, - 0x69, 0x7a, 0x65, 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, - 0x65, 0x73, 0x70, 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, - 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, - 0x12, 0x28, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, - 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, - 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, - 0x72, 0x6d, 0x61, 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, - 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, - 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x2f, 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, + 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, + 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, + 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 
0x52, 0x04, + 0x4d, 0x65, 0x74, 0x61, 0x12, 0x14, 0x0a, 0x05, 0x42, 0x61, 0x73, 0x69, 0x63, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x05, 0x42, 0x61, 0x73, 0x69, 0x63, 0x12, 0x1a, 0x0a, 0x08, 0x4d, 0x65, + 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, + 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, + 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, + 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, + 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, + 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, + 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, + 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, + 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, + 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, + 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, + 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index 4a368aeb5e3e..f444453c6b77 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2023 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -152,6 +152,7 @@ const ( ServerIncompatibleComponents ServerNoCompatibilityInsecure ServerPoolHasContainers + ServerHugepagesDisabled ) // server config fault codes @@ -182,7 +183,7 @@ const ( ServerConfigFaultCallbackEmpty ServerConfigFaultDomainTooManyLayers ServerConfigNrHugepagesOutOfRange - ServerConfigHugepagesDisabled + ServerConfigHugepagesDisabledWithBdevs ServerConfigVMDSettingDuplicate ServerConfigEngineNUMAImbalance ServerConfigControlMetadataNoPath diff --git a/src/control/lib/control/auto_test.go b/src/control/lib/control/auto_test.go index 749c603276cf..beceb7b7b59e 100644 --- a/src/control/lib/control/auto_test.go +++ b/src/control/lib/control/auto_test.go @@ -522,10 +522,14 @@ func TestControl_AutoConfig_getStorageSet(t *testing.T) { expStorageSet: &HostStorageSet{ HostSet: hostlist.MustCreateSet("host[1-2]"), HostStorage: &HostStorage{ - NvmeDevices: storage.NvmeControllers{storage.MockNvmeController()}, - ScmModules: storage.ScmModules{storage.MockScmModule()}, - ScmNamespaces: storage.ScmNamespaces{storage.MockScmNamespace(0)}, - MemInfo: MockMemInfo(), + NvmeDevices: storage.NvmeControllers{ + mockNvmeCtrlrWithSmd(storage.OptionBits(0)), + }, + ScmModules: storage.ScmModules{storage.MockScmModule()}, + ScmNamespaces: storage.ScmNamespaces{ + storage.MockScmNamespace(0), + }, + MemInfo: MockMemInfo(), }, }, }, @@ -569,7 +573,7 @@ func TestControl_AutoConfig_getStorageSet(t *testing.T) { }, defResCmpOpts()...) 
if diff := cmp.Diff(tc.expStorageSet, storageSet, cmpOpts...); diff != "" { - t.Fatalf("unexpected network set (-want, +got):\n%s\n", diff) + t.Fatalf("unexpected storage set (-want, +got):\n%s\n", diff) } }) } diff --git a/src/control/lib/control/mocks.go b/src/control/lib/control/mocks.go index c70bd89e5792..bf080d8c8f6f 100644 --- a/src/control/lib/control/mocks.go +++ b/src/control/lib/control/mocks.go @@ -175,8 +175,6 @@ func (mi *MockInvoker) InvokeUnaryRPCAsync(ctx context.Context, uReq UnaryReques return } } - - mi.log.Debug("sending mock response") responses <- hr } close(responses) @@ -318,21 +316,30 @@ func MockMemInfo() *common.MemInfo { } } +func mockNvmeCtrlrWithSmd(bdevRoles storage.OptionBits, varIdx ...int32) *storage.NvmeController { + idx := test.GetIndex(varIdx...) + nc := storage.MockNvmeController(idx) + sd := storage.MockSmdDevice(nil, idx) + sd.Roles = storage.BdevRoles{bdevRoles} + nc.SmdDevices = []*storage.SmdDevice{sd} + return nc +} + func standardServerScanResponse(t *testing.T) *ctlpb.StorageScanResp { pbSsr := &ctlpb.StorageScanResp{ Nvme: &ctlpb.ScanNvmeResp{}, Scm: &ctlpb.ScanScmResp{}, MemInfo: commonpb.MockPBMemInfo(), } + nvmeControllers := storage.NvmeControllers{ - storage.MockNvmeController(), - } - scmModules := storage.ScmModules{ - storage.MockScmModule(), + mockNvmeCtrlrWithSmd(storage.OptionBits(0)), } if err := convert.Types(nvmeControllers, &pbSsr.Nvme.Ctrlrs); err != nil { t.Fatal(err) } + + scmModules := storage.ScmModules{storage.MockScmModule()} if err := convert.Types(scmModules, &pbSsr.Scm.Modules); err != nil { t.Fatal(err) } @@ -355,7 +362,7 @@ func MockServerScanResp(t *testing.T, variant string) *ctlpb.StorageScanResp { ctrlrs := func(idxs ...int) storage.NvmeControllers { ncs := make(storage.NvmeControllers, 0, len(idxs)) for _, i := range idxs { - nc := storage.MockNvmeController(int32(i)) + nc := mockNvmeCtrlrWithSmd(storage.BdevRoleAll, int32(i)) ncs = append(ncs, nc) } return ncs @@ -541,6 +548,7 @@ type MockFormatConf struct { NvmePerHost int ScmFailures map[int]struct{} NvmeFailures map[int]struct{} + NvmeRoleBits int } // MockFormatResp returns a populated StorageFormatResp based on input config. 
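A minimal sketch of driving the new NvmeRoleBits field from a test, mirroring the MD-on-SSD case exercised in the pretty-printer tests above (illustrative only; assumes storage.BdevRoleAll is the combined data|meta|wal bitmask):

	// Build a mock format response whose NVMe results carry MD-on-SSD roles.
	resp := control.MockFormatResp(t, control.MockFormatConf{
		Hosts:        1,
		ScmPerHost:   1,
		NvmePerHost:  1,
		NvmeRoleBits: int(storage.BdevRoleAll), // data,meta,wal on each mock device
	})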
@@ -583,6 +591,13 @@ func MockFormatResp(t *testing.T, mfc MockFormatConf) *StorageFormatResp { hs.NvmeDevices = append(hs.NvmeDevices, &storage.NvmeController{ Info: ctlpb.ResponseStatus_CTL_SUCCESS.String(), PciAddr: fmt.Sprintf("%d", j+1), + SmdDevices: []*storage.SmdDevice{ + { + Roles: storage.BdevRoles{ + storage.OptionBits(mfc.NvmeRoleBits), + }, + }, + }, }) } if err := hsm.Add(hostName, hs); err != nil { diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 79b1efaf1e22..24327914275c 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -1365,7 +1365,7 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl for _, smdDevice := range controller.SmdDevices { if !smdDevice.Roles.IsEmpty() && (smdDevice.Roles.OptionBits&storage.BdevRoleData) == 0 { log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not used for storing data", - smdDevice.UUID, smdDevice.Rank, controller.PciAddr, smdDevice.Rank) + smdDevice.UUID, smdDevice.Rank, controller.PciAddr) continue } @@ -1377,7 +1377,7 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl if !filterRank(smdDevice.Rank) { log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not in ranklist", - smdDevice.UUID, smdDevice.Rank, controller.PciAddr, smdDevice.Rank) + smdDevice.UUID, smdDevice.Rank, controller.PciAddr) continue } diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 12f8389eadcb..8f015970b275 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -141,7 +141,6 @@ type ( unaryRequest Usage bool NvmeHealth bool - NvmeMeta bool NvmeBasic bool } @@ -240,7 +239,7 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe Basic: req.NvmeBasic, // Health and meta details required to populate usage statistics. Health: req.NvmeHealth || req.Usage, - Meta: req.NvmeMeta || req.Usage, + Meta: req.Usage, }, }) }) @@ -306,6 +305,13 @@ func (sfr *StorageFormatResp) addHostResponse(hr *HostResponse) (err error) { hs.NvmeDevices = append(hs.NvmeDevices, &storage.NvmeController{ Info: info, PciAddr: nr.GetPciAddr(), + SmdDevices: []*storage.SmdDevice{ + { + Roles: storage.BdevRoles{ + storage.OptionBits(nr.RoleBits), + }, + }, + }, }) default: if err := ctlStateToErr(nr.GetState()); err != nil { diff --git a/src/control/lib/daos/pool_cont_prop.go b/src/control/lib/daos/pool_cont_prop.go index e7fd6703e5e8..91b6e0ac8b67 100644 --- a/src/control/lib/daos/pool_cont_prop.go +++ b/src/control/lib/daos/pool_cont_prop.go @@ -126,7 +126,9 @@ const ( //PoolPropertyPerfDomain is pool performance domain PoolPropertyPerfDomain = C.DAOS_PROP_PO_PERF_DOMAIN //PoolPropertyReintMode is pool reintegration mode - PoolPropertyReintMode = C.DAOS_PROP_PO_REINT_MODE + PoolPropertyReintMode = C.DAOS_PROP_PO_REINT_MODE + PoolPropertySvcOpsEnabled = C.DAOS_PROP_PO_SVC_OPS_ENABLED + PoolPropertySvcOpsEntryAge = C.DAOS_PROP_PO_SVC_OPS_ENTRY_AGE ) const ( @@ -185,6 +187,8 @@ const ( PoolSvcRedunFacMax = C.DAOS_PROP_PO_SVC_REDUN_FAC_MAX // PoolSvcRedunFacDefault defines the default value of PoolPropertySvcRedunFac. 
PoolSvcRedunFacDefault = C.DAOS_PROP_PO_SVC_REDUN_FAC_DEFAULT + PoolSvcOpsEntryAgeMin = C.DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MIN + PoolSvcOpsEntryAgeMax = C.DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MAX ) const ( diff --git a/src/control/lib/daos/pool_property.go b/src/control/lib/daos/pool_property.go index 6e9dedd5e40e..13434419d25d 100644 --- a/src/control/lib/daos/pool_property.go +++ b/src/control/lib/daos/pool_property.go @@ -95,6 +95,56 @@ func PoolProperties() PoolPropertyMap { valueMarshaler: numericMarshaler, }, }, + "svc_ops_enabled": { + Property: PoolProperty{ + Number: PoolPropertySvcOpsEnabled, + Description: "Metadata duplicate operations detection enabled", + valueHandler: func(s string) (*PoolPropertyValue, error) { + oeErr := errors.Errorf("invalid svc_ops_enabled value %s (valid values: 0-1)", s) + oeVal, err := strconv.ParseUint(s, 10, 32) + if err != nil { + return nil, oeErr + } + if oeVal > 1 { + return nil, errors.Wrap(oeErr, "value supplied is greater than 1") + } + return &PoolPropertyValue{oeVal}, nil + }, + valueStringer: func(v *PoolPropertyValue) string { + n, err := v.GetNumber() + if err != nil { + return "not set" + } + return fmt.Sprintf("%d", n) + }, + valueMarshaler: numericMarshaler, + }, + }, + "svc_ops_entry_age": { + Property: PoolProperty{ + Number: PoolPropertySvcOpsEntryAge, + Description: "Metadata duplicate operations KVS max entry age, in seconds", + valueHandler: func(s string) (*PoolPropertyValue, error) { + oeErr := errors.Errorf("invalid svc_ops_entry_age %s (valid values: %d-%d)", s, PoolSvcOpsEntryAgeMin, PoolSvcOpsEntryAgeMax) + oeVal, err := strconv.ParseUint(s, 10, 32) + if err != nil { + return nil, oeErr + } + if oeVal < PoolSvcOpsEntryAgeMin || oeVal > PoolSvcOpsEntryAgeMax { + return nil, errors.Wrap(oeErr, "value supplied is out of range") + } + return &PoolPropertyValue{oeVal}, nil + }, + valueStringer: func(v *PoolPropertyValue) string { + n, err := v.GetNumber() + if err != nil { + return "not set" + } + return fmt.Sprintf("%d", n) + }, + valueMarshaler: numericMarshaler, + }, + }, "label": { Property: PoolProperty{ Number: PoolPropertyLabel, diff --git a/src/control/lib/spdk/nvme.go b/src/control/lib/spdk/nvme.go index b6b38edd044a..b2a06c056125 100644 --- a/src/control/lib/spdk/nvme.go +++ b/src/control/lib/spdk/nvme.go @@ -76,15 +76,6 @@ func wrapCleanError(inErr error, cleanErr error) (outErr error) { return } -func ctrlrPCIAddresses(ctrlrs storage.NvmeControllers) []string { - pciAddrs := make([]string, 0, len(ctrlrs)) - for _, c := range ctrlrs { - pciAddrs = append(pciAddrs, c.PciAddr) - } - - return pciAddrs -} - func resultPCIAddresses(results []*FormatResult) []string { pciAddrs := make([]string, 0, len(results)) for _, r := range results { diff --git a/src/control/lib/spdk/nvme_default.go b/src/control/lib/spdk/nvme_default.go index d9efbc69fc9a..4e1f40376a12 100644 --- a/src/control/lib/spdk/nvme_default.go +++ b/src/control/lib/spdk/nvme_default.go @@ -57,8 +57,11 @@ func (n *NvmeImpl) Discover(log logging.Logger) (storage.NvmeControllers, error) ctrlrs, err := collectCtrlrs(C.nvme_discover(), "NVMe Discover(): C.nvme_discover") - pciAddrs := ctrlrPCIAddresses(ctrlrs) - log.Debugf("discovered nvme ssds: %v", pciAddrs) + pciAddrs := make([]string, 0, len(ctrlrs)) + for _, c := range ctrlrs { + log.Debugf("nvme ssd scanned: %+v", c) + pciAddrs = append(pciAddrs, c.PciAddr) + } return ctrlrs, wrapCleanError(err, cleanLockfiles(log, realRemove, pciAddrs...)) } diff --git a/src/control/lib/spdk/src/nvme_control_common.c 
b/src/control/lib/spdk/src/nvme_control_common.c index 5bbecccbcb5b..4d7d138fd08e 100644 --- a/src/control/lib/spdk/src/nvme_control_common.c +++ b/src/control/lib/spdk/src/nvme_control_common.c @@ -250,25 +250,17 @@ _discover(prober probe, bool detach, health_getter get_health) } static int -str2ctrlr(char **dst, const void *src) +str2ctrlr(char **dst, const void *src, size_t size) { - int len; - assert(src != NULL); assert(dst != NULL); assert(*dst == NULL); - len = strnlen((const char *)src, NVME_DETAIL_BUFLEN); - if (len == NVME_DETAIL_BUFLEN) { - perror("src buf too big"); - return -NVMEC_ERR_CHK_SIZE; - } - - *dst = calloc(1, len + 1); + *dst = calloc(1, size + 1); if (*dst == NULL) return -ENOMEM; - if (copy_ascii(*dst, len + 1, src, len) != 0) { + if (copy_ascii(*dst, size + 1, src, size) != 0) { perror("copy_ascii"); return -NVMEC_ERR_CHK_SIZE; } @@ -281,18 +273,15 @@ copy_ctrlr_data(struct nvme_ctrlr_t *cdst, const struct spdk_nvme_ctrlr_data *cd { int rc; - rc = str2ctrlr(&cdst->model, cdata->mn); - if (rc != 0) { + rc = str2ctrlr(&cdst->model, cdata->mn, sizeof(cdata->mn)); + if (rc != 0) return rc; - } - rc = str2ctrlr(&cdst->serial, cdata->sn); - if (rc != 0) { + rc = str2ctrlr(&cdst->serial, cdata->sn, sizeof(cdata->sn)); + if (rc != 0) return rc; - } - rc = str2ctrlr(&cdst->fw_rev, cdata->fr); - if (rc != 0) { + rc = str2ctrlr(&cdst->fw_rev, cdata->fr, sizeof(cdata->fr)); + if (rc != 0) return rc; - } return 0; } diff --git a/src/control/logging/context.go b/src/control/logging/context.go new file mode 100644 index 000000000000..f3250fd562bb --- /dev/null +++ b/src/control/logging/context.go @@ -0,0 +1,52 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package logging + +import ( + "context" + "errors" +) + +type contextKeyType string + +var contextKey contextKeyType = "logging.Logger" + +func getCtxLogger(ctx context.Context) (Logger, bool) { + if ctx == nil { + return nil, false + } + + if logger, ok := ctx.Value(contextKey).(Logger); ok { + return logger, true + } + return nil, false +} + +// FromContext returns the logger from the context, +// or a no-op logger if no logger is present. +func FromContext(ctx context.Context) Logger { + if logger, ok := getCtxLogger(ctx); ok { + return logger + } + return &LeveledLogger{level: LogLevelDisabled} +} + +// ToContext adds the logger to the context if +// it is not already present. +func ToContext(ctx context.Context, logger Logger) (context.Context, error) { + if ctx == nil { + return nil, errors.New("nil context") + } + if logger == nil { + return nil, errors.New("nil logger") + } + + if _, ok := getCtxLogger(ctx); ok { + return nil, errors.New("logger already present in context") + } + return context.WithValue(ctx, contextKey, logger), nil +} diff --git a/src/control/logging/context_test.go b/src/control/logging/context_test.go new file mode 100644 index 000000000000..8b798264072c --- /dev/null +++ b/src/control/logging/context_test.go @@ -0,0 +1,66 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package logging_test + +import ( + "strings" + "testing" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestLogging_ToFromContext(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + ctx, err := logging.ToContext(test.Context(t), log) + if err != nil { + t.Fatal(err) + } + log2 := logging.FromContext(ctx) + log2.Info("test") + + if !strings.Contains(buf.String(), "test") { + t.Fatal("expected test message in log buffer") + } +} + +func TestLogging_FromContext_Unset(t *testing.T) { + log := logging.FromContext(test.Context(t)) + log.Info("shouldn't panic (noop)") +} + +func TestLogging_FromContext_NilCtx(t *testing.T) { + log := logging.FromContext(nil) + log.Info("shouldn't panic (noop)") +} + +func TestLogging_ToContext_NilCtx(t *testing.T) { + _, err := logging.ToContext(nil, &logging.LeveledLogger{}) + if err == nil { + t.Fatal("expected error") + } +} + +func TestLogging_ToContext_NilLogger(t *testing.T) { + _, err := logging.ToContext(test.Context(t), nil) + if err == nil { + t.Fatal("expected error") + } +} + +func TestLogging_ToContext_AlreadySet(t *testing.T) { + ctx, err := logging.ToContext(test.Context(t), &logging.LeveledLogger{}) + if err != nil { + t.Fatal(err) + } + _, err = logging.ToContext(ctx, &logging.LeveledLogger{}) + if err == nil { + t.Fatal("expected error") + } +} diff --git a/src/control/server/config/faults.go b/src/control/server/config/faults.go index 67148dbd0102..31c034d3e5b6 100644 --- a/src/control/server/config/faults.go +++ b/src/control/server/config/faults.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -91,10 +91,10 @@ var ( "the fault domain path may have a maximum of 2 levels below the root", "update either the fault domain ('fault_path' parameter) or callback script ('fault_cb' parameter) and restart the control server", ) - FaultConfigHugepagesDisabled = serverConfigFault( - code.ServerConfigHugepagesDisabled, + FaultConfigHugepagesDisabledWithBdevs = serverConfigFault( + code.ServerConfigHugepagesDisabledWithBdevs, "hugepages cannot be disabled if bdevs have been specified in config", + "either set disable_hugepages parameter to false (or remove it), or remove nvme storage assignment in config, then restart the control server", - "remove nr_hugepages parameter from config to have the value automatically calculated", ) FaultConfigVMDSettingDuplicate = serverConfigFault( code.ServerConfigVMDSettingDuplicate, diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index 4b6eeb911ab8..6242478413e6 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -488,7 +488,7 @@ func (cfg *Server) SetNrHugepages(log logging.Logger, mi *common.MemInfo) error } if cfg.DisableHugepages { - return FaultConfigHugepagesDisabled + return FaultConfigHugepagesDisabledWithBdevs } // Calculate minimum number of hugepages for all configured engines. diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index 48ead6d68fba..55082875f4d3 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation.
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -289,7 +289,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithEnvVars("CRT_TIMEOUT=30"). WithLogFile("/tmp/daos_engine.0.log"). WithLogMask("INFO"). - WithStorageEnableHotplug(true), + WithStorageEnableHotplug(true). + WithStorageAutoFaultyCriteria(true, 100, 200), engine.MockConfig(). WithSystemName("daos_server"). WithSocketDir("./.daos/daos_server"). @@ -316,7 +317,8 @@ func TestServerConfig_Constructed(t *testing.T) { WithEnvVars("CRT_TIMEOUT=100"). WithLogFile("/tmp/daos_engine.1.log"). WithLogMask("INFO"). - WithStorageEnableHotplug(true), + WithStorageEnableHotplug(true). + WithStorageAutoFaultyCriteria(false, 0, 0), } constructed.Path = testFile // just to avoid failing the cmp @@ -863,7 +865,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { ), ) }, - expErr: FaultConfigHugepagesDisabled, + expErr: FaultConfigHugepagesDisabledWithBdevs, }, "disabled hugepages; emulated bdevs configured": { extraConfig: func(c *Server) *Server { @@ -883,7 +885,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { ), ) }, - expErr: FaultConfigHugepagesDisabled, + expErr: FaultConfigHugepagesDisabledWithBdevs, }, "disabled hugepages; no bdevs configured": { extraConfig: func(c *Server) *Server { diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 9cbdaeafbbe4..1d8c20d843f1 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -27,6 +27,7 @@ const ( devStateNew = ctlpb.NvmeDevState_NEW devStateNormal = ctlpb.NvmeDevState_NORMAL devStateFaulty = ctlpb.NvmeDevState_EVICTED + devStateUnplug = ctlpb.NvmeDevState_UNPLUGGED ledStateIdentify = ctlpb.LedState_QUICK_BLINK ledStateNormal = ctlpb.LedState_OFF @@ -267,6 +268,15 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { LedState: ledStateFault, }, }, + { + Uuid: test.MockUUID(2), + TgtIds: []int32{}, + Ctrlr: &ctlpb.NvmeController{ + PciAddr: "0000:8b:00.0", + DevState: devStateUnplug, + LedState: ledStateUnknown, + }, + }, }, }, }, @@ -276,7 +286,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { Message: &ctlpb.SmdDevResp{ Devices: []*ctlpb.SmdDevice{ { - Uuid: test.MockUUID(2), + Uuid: test.MockUUID(3), TgtIds: []int32{0, 1, 2}, Ctrlr: &ctlpb.NvmeController{ PciAddr: "0000:da:00.0", @@ -285,7 +295,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, }, { - Uuid: test.MockUUID(3), + Uuid: test.MockUUID(4), TgtIds: []int32{3, 4, 5}, Ctrlr: &ctlpb.NvmeController{ PciAddr: "0000:db:00.0", @@ -320,13 +330,22 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { LedState: ledStateFault, }, }, + { + Uuid: test.MockUUID(2), + TgtIds: []int32{}, + Ctrlr: &ctlpb.NvmeController{ + PciAddr: "0000:8b:00.0", + DevState: devStateUnplug, + LedState: ledStateUnknown, + }, + }, }, Rank: uint32(0), }, { Devices: []*ctlpb.SmdDevice{ { - Uuid: test.MockUUID(2), + Uuid: test.MockUUID(3), TgtIds: []int32{0, 1, 2}, Ctrlr: &ctlpb.NvmeController{ PciAddr: "0000:da:00.0", @@ -335,7 +354,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, }, { - Uuid: test.MockUUID(3), + Uuid: test.MockUUID(4), TgtIds: []int32{3, 4, 5}, Ctrlr: &ctlpb.NvmeController{ PciAddr: "0000:db:00.0", @@ -1561,7 +1580,7 @@ func TestServer_CtlSvc_SmdManage(t *testing.T) { svc.harness.started.SetTrue() for i, e := range svc.harness.instances { - srv := e.(*EngineInstance) + ei := e.(*EngineInstance) cfg := new(mockDrpcClientConfig) if tc.junkResp { cfg.setSendMsgResponse(drpc.Status_SUCCESS, makeBadBytes(42), nil) @@ -1570,8 +1589,8 @@ 
func TestServer_CtlSvc_SmdManage(t *testing.T) { cfg.setSendMsgResponseList(t, mock) } } - srv.setDrpcClient(newMockDrpcClient(cfg)) - srv.ready.SetTrue() + ei.setDrpcClient(newMockDrpcClient(cfg)) + ei.ready.SetTrue() } if tc.harnessStopped { svc.harness.started.SetFalse() diff --git a/src/control/server/ctl_storage.go b/src/control/server/ctl_storage.go index 443f2a0bc76e..12ceb21a3214 100644 --- a/src/control/server/ctl_storage.go +++ b/src/control/server/ctl_storage.go @@ -101,25 +101,26 @@ func (cs *ControlService) getScmUsage(ssr *storage.ScmScanResponse) (*storage.Sc instances := cs.harness.Instances() - nss := make(storage.ScmNamespaces, len(instances)) - for idx, ei := range instances { - if !ei.IsReady() { + nss := make(storage.ScmNamespaces, 0, len(instances)) + for _, engine := range instances { + if !engine.IsReady() { continue // skip if not running } - cfg, err := ei.GetStorage().GetScmConfig() + cfg, err := engine.GetStorage().GetScmConfig() if err != nil { return nil, err } - mount, err := ei.GetStorage().GetScmUsage() + mount, err := engine.GetStorage().GetScmUsage() if err != nil { return nil, err } + var ns *storage.ScmNamespace switch mount.Class { case storage.ClassRam: // generate fake namespace for emulated ramdisk mounts - nss[idx] = &storage.ScmNamespace{ + ns = &storage.ScmNamespace{ Mount: mount, BlockDevice: "ramdisk", Size: uint64(humanize.GiByte * cfg.Scm.RamdiskSize), @@ -127,29 +128,32 @@ func (cs *ControlService) getScmUsage(ssr *storage.ScmScanResponse) (*storage.Sc case storage.ClassDcpm: // update namespace mount info for online storage if ssr.Namespaces == nil { return nil, errors.Errorf("instance %d: input scm scan response missing namespaces", - ei.Index()) + engine.Index()) } - ns := findPMemInScan(ssr, mount.DeviceList) + ns = findPMemInScan(ssr, mount.DeviceList) if ns == nil { return nil, errors.Errorf("instance %d: no pmem namespace for mount %s", - ei.Index(), mount.Path) + engine.Index(), mount.Path) } ns.Mount = mount - nss[idx] = ns } - if nss[idx].Mount != nil { - rank, err := ei.GetRank() + if ns.Mount != nil { + rank, err := engine.GetRank() if err != nil { return nil, errors.Wrapf(err, "instance %d: no rank associated for mount %s", - ei.Index(), mount.Path) + engine.Index(), mount.Path) } - nss[idx].Mount.Rank = rank + ns.Mount.Rank = rank } - cs.log.Debugf("updated scm fs usage on device %s mounted at %s: %+v", - nss[idx].BlockDevice, mount.Path, nss[idx].Mount) + cs.log.Debugf("updated scm fs usage on device %s mounted at %s: %+v", ns.BlockDevice, + mount.Path, ns.Mount) + nss = append(nss, ns) } + if len(nss) == 0 { + return nil, errors.New("no scm details found") + } return &storage.ScmScanResponse{Namespaces: nss}, nil } diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index a7a87de805a0..71339918876f 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -13,25 +13,27 @@ import ( "strconv" "github.com/dustin/go-humanize" - "github.com/dustin/go-humanize/english" "github.com/pkg/errors" "golang.org/x/net/context" "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/proto" "github.com/daos-stack/daos/src/control/common/proto/convert" - "github.com/daos-stack/daos/src/control/common/proto/ctl" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/engine" "github.com/daos-stack/daos/src/control/server/storage" ) const ( - msgFormatErr = "instance %d: failure formatting storage, check RPC response for details" - msgNvmeFormatSkip = "NVMe format skipped on instance %d as SCM format did not complete" + msgFormatErr = "instance %d: failure formatting storage, check RPC response for details" + msgNvmeFormatSkip = "NVMe format skipped on instance %d" + msgNvmeFormatSkipHPD = msgNvmeFormatSkip + ", use of hugepages disabled in config" + msgNvmeFormatSkipFail = msgNvmeFormatSkip + ", SCM format failed" + msgNvmeFormatSkipNotDone = msgNvmeFormatSkip + ", SCM was not formatted" // Storage size reserved for storing DAOS metadata stored on SCM device. // // NOTE This storage size value is larger than the minimal size observed (i.e. 36864B), @@ -48,6 +50,11 @@ const ( mdFsScmBytes uint64 = humanize.MiByte ) +var ( + errNoSrvCfg = errors.New("ControlService has no server config") + errNilReq = errors.New("nil request") +) + // newResponseState creates, populates and returns ResponseState. func newResponseState(inErr error, badStatus ctlpb.ResponseStatus, infoMsg string) *ctlpb.ResponseState { rs := new(ctlpb.ResponseState) @@ -70,8 +77,39 @@ var ( type scanBdevsFn func(storage.BdevScanRequest) (*storage.BdevScanResponse, error) +func ctrlrToPciStr(nc *ctlpb.NvmeController) (string, error) { + pciAddr, err := hardware.NewPCIAddress(nc.GetPciAddr()) + if err != nil { + return "", errors.Wrapf(err, "Invalid PCI address") + } + if pciAddr.IsVMDBackingAddress() { + if pciAddr, err = pciAddr.BackingToVMDAddress(); err != nil { + return "", errors.Wrapf(err, "Invalid VMD address") + } + } + + return pciAddr.String(), nil +} + +func findBdevTier(pciAddr string, tcs storage.TierConfigs) *storage.TierConfig { + for _, tc := range tcs { + if !tc.IsBdev() { + continue + } + for _, name := range tc.Bdev.DeviceList.Devices() { + if pciAddr == name { + return tc + } + } + } + + return nil +} + // Convert bdev scan results to protobuf response. -func bdevScanToProtoResp(scan scanBdevsFn, req storage.BdevScanRequest) (*ctlpb.ScanNvmeResp, error) { +func bdevScanToProtoResp(scan scanBdevsFn, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) { + req := storage.BdevScanRequest{DeviceList: bdevCfgs.Bdevs()} + resp, err := scan(req) if err != nil { return nil, err @@ -83,38 +121,58 @@ func bdevScanToProtoResp(scan scanBdevsFn, req storage.BdevScanRequest) (*ctlpb. return nil, err } + if bdevCfgs.HaveRealNVMe() { + // Update proto Ctrlrs with role info and normal (DAOS) state for off-line display. 
+		for _, c := range pbCtrlrs {
+			pciAddrStr, err := ctrlrToPciStr(c)
+			if err != nil {
+				return nil, err
+			}
+			bc := findBdevTier(pciAddrStr, bdevCfgs)
+			if bc == nil {
+				return nil, errors.Errorf("unknown PCI device, scanned ctrlr %q "+
+					"not found in cfg", pciAddrStr)
+			}
+			if len(c.SmdDevices) != 0 {
+				return nil, errors.Errorf("scanned ctrlr %q has unexpected smd",
+					pciAddrStr)
+			}
+			c.SmdDevices = append(c.SmdDevices, &ctlpb.SmdDevice{
+				RoleBits: uint32(bc.Bdev.DeviceRoles.OptionBits),
+				Rank:     uint32(ranklist.NilRank),
+			})
+			c.DevState = ctlpb.NvmeDevState_NORMAL
+		}
+	}
+
 	return &ctlpb.ScanNvmeResp{
 		State:  new(ctlpb.ResponseState),
 		Ctrlrs: pbCtrlrs,
 	}, nil
 }
 
-// Scan bdevs through harness's ControlService (not per-engine).
-func bdevScanGlobal(cs *ControlService, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) {
-	req := storage.BdevScanRequest{DeviceList: cfgBdevs}
-	return bdevScanToProtoResp(cs.storage.ScanBdevs, req)
-}
-
 // Scan bdevs through each engine and collate response results.
 func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (*ctlpb.ScanNvmeResp, error) {
 	var errLast error
 	instances := cs.harness.Instances()
 	resp := &ctlpb.ScanNvmeResp{}
 
-	for _, ei := range instances {
+	for _, engine := range instances {
 		eReq := new(ctlpb.ScanNvmeReq)
 		*eReq = *req
 		if req.Meta {
-			ms, rs, err := computeMetaRdbSz(cs, ei, nsps)
+			ms, rs, err := computeMetaRdbSz(cs, engine, nsps)
 			if err != nil {
 				return nil, errors.Wrap(err, "computing meta and rdb size")
 			}
 			eReq.MetaSize, eReq.RdbSize = ms, rs
 		}
 
-		respEng, err := scanEngineBdevs(ctx, ei, eReq)
+		// If only a subset of engines returns results, indicate errors for non-ready
+		// engines whilst returning successful scan results.
+		respEng, err := scanEngineBdevs(ctx, engine, eReq)
 		if err != nil {
-			err = errors.Wrapf(err, "instance %d", ei.Index())
+			err = errors.Wrapf(err, "instance %d", engine.Index())
 			if errLast == nil && len(instances) > 1 {
 				errLast = err // Save err to preserve partial results.
 				cs.log.Error(err.Error())
@@ -140,10 +198,8 @@ func bdevScanTrimResults(req *ctlpb.ScanNvmeReq, resp *ctlpb.ScanNvmeResp) *ctlp
 		if !req.GetHealth() {
 			pbc.HealthStats = nil
 		}
-		if !req.GetMeta() {
-			pbc.SmdDevices = nil
-		}
 		if req.GetBasic() {
+			pbc.SmdDevices = nil
 			pbc.Serial = ""
 			pbc.Model = ""
 			pbc.FwRev = ""
@@ -163,11 +219,15 @@ func engineHasStarted(instances []Engine) bool {
 	return false
 }
 
-func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace, hasStarted *bool, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) {
+func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace, hasStarted *bool, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) {
 	*hasStarted = engineHasStarted(cs.harness.Instances())
 	if !*hasStarted {
 		cs.log.Debugf("scan bdevs from control service as no engines started")
-		return bdevScanGlobal(cs, cfgBdevs)
+		if req.Meta {
+			return nil, errors.New("meta smd usage info unavailable as engines stopped")
+		}
+
+		return bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs)
 	}
 
 	// Delegate scan to engine instances as soon as one engine with assigned bdevs has started.
@@ -179,19 +239,30 @@
 // or not. If running, scan over dRPC. If not running then use engine's storage provider.
func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (resp *ctlpb.ScanNvmeResp, err error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq } + if cs.srvCfg != nil && cs.srvCfg.DisableHugepages { + return nil, errors.New("cannot scan bdevs if hugepages have been disabled") + } + + defer func() { + if err == nil && req.Meta { + cs.adjustNvmeSize(resp) + } + }() - cfgBdevs := getBdevCfgsFromSrvCfg(cs.srvCfg).Bdevs() + bdevCfgs := getBdevCfgsFromSrvCfg(cs.srvCfg) + nrCfgBdevs := bdevCfgs.Bdevs().Len() - if cfgBdevs.Len() == 0 { + if nrCfgBdevs == 0 { cs.log.Debugf("scan bdevs from control service as no bdevs in cfg") // No bdevs configured for engines to claim so scan through control service. - resp, err = bdevScanGlobal(cs, cfgBdevs) + resp, err = bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs) if err != nil { return nil, err } + return bdevScanTrimResults(req, resp), nil } @@ -200,28 +271,42 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n // been claimed by SPDK but details are not yet available over dRPC. var hasStarted bool - resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs) + resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs) + if err != nil { + return nil, err + } + + nrScannedBdevs, err := getEffCtrlrCount(resp.Ctrlrs) if err != nil { return nil, err } + if nrScannedBdevs == nrCfgBdevs { + return bdevScanTrimResults(req, resp), nil + } - // Retry once if global scan returns unexpected number of controllers in case engines + // Retry once if harness scan returns unexpected number of controllers in case engines // claimed devices between when started state was checked and scan was executed. - if !hasStarted && len(resp.Ctrlrs) != cfgBdevs.Len() { - cs.log.Debugf("retrying bdev scan as unexpected nr returned, want %d got %d", - cfgBdevs.Len(), len(resp.Ctrlrs)) + if !hasStarted { + cs.log.Debugf("retrying harness bdev scan as unexpected nr returned, want %d got %d", + nrCfgBdevs, nrScannedBdevs) - resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs) + resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs) if err != nil { return nil, err } - } - if len(resp.Ctrlrs) != cfgBdevs.Len() { - cs.log.Noticef("bdev scan returned unexpected nr, want %d got %d", - cfgBdevs.Len(), len(resp.Ctrlrs)) + nrScannedBdevs, err := getEffCtrlrCount(resp.Ctrlrs) + if err != nil { + return nil, err + } + if nrScannedBdevs == nrCfgBdevs { + return bdevScanTrimResults(req, resp), nil + } } + cs.log.Noticef("harness bdev scan returned unexpected nr, want %d got %d", nrCfgBdevs, + nrScannedBdevs) + return bdevScanTrimResults(req, resp), nil } @@ -253,56 +338,55 @@ func newScanScmResp(inResp *storage.ScmScanResponse, inErr error) (*ctlpb.ScanSc } // scanScm will return mount details and usage for either emulated RAM or real PMem. 
-func (c *ControlService) scanScm(ctx context.Context, req *ctlpb.ScanScmReq) (*ctlpb.ScanScmResp, error) { +func (cs *ControlService) scanScm(ctx context.Context, req *ctlpb.ScanScmReq) (*ctlpb.ScanScmResp, error) { if req == nil { return nil, errors.New("nil scm request") } - ssr, scanErr := c.ScmScan(storage.ScmScanRequest{}) + ssr, err := cs.ScmScan(storage.ScmScanRequest{}) + if err != nil || !req.GetUsage() { + return newScanScmResp(ssr, err) + } - if scanErr != nil || !req.GetUsage() { - return newScanScmResp(ssr, scanErr) + ssr, err = cs.getScmUsage(ssr) + if err != nil { + return nil, err + } + + resp, err := newScanScmResp(ssr, nil) + if err != nil { + return nil, err } - return newScanScmResp(c.getScmUsage(ssr)) + cs.adjustScmSize(resp) + + return resp, nil } // Returns the engine configuration managing the given NVMe controller -func (c *ControlService) getEngineCfgFromNvmeCtl(nc *ctl.NvmeController) (*engine.Config, error) { - pciAddr, err := hardware.NewPCIAddress(nc.GetPciAddr()) +func (cs *ControlService) getEngineCfgFromNvmeCtl(nc *ctlpb.NvmeController) (*engine.Config, error) { + pciAddrStr, err := ctrlrToPciStr(nc) if err != nil { - return nil, errors.Errorf("Invalid PCI address: %s", err) - } - if pciAddr.IsVMDBackingAddress() { - if pciAddr, err = pciAddr.BackingToVMDAddress(); err != nil { - return nil, errors.Errorf("Invalid VMD address: %s", err) - } + return nil, err } - ctlrAddr := pciAddr.String() - for index := range c.srvCfg.Engines { - for _, tierCfg := range c.srvCfg.Engines[index].Storage.Tiers { - if !tierCfg.IsBdev() { - continue - } - for _, devName := range tierCfg.Bdev.DeviceList.Devices() { - if devName == ctlrAddr { - return c.srvCfg.Engines[index], nil - } - } + for index := range cs.srvCfg.Engines { + if findBdevTier(pciAddrStr, cs.srvCfg.Engines[index].Storage.Tiers) != nil { + return cs.srvCfg.Engines[index], nil } } - return nil, errors.Errorf("unknown PCI device %q", pciAddr) + return nil, errors.Errorf("unknown PCI device, scanned ctrlr %q not found in cfg", + pciAddrStr) } // Returns the engine configuration managing the given SCM name-space -func (c *ControlService) getEngineCfgFromScmNsp(nsp *ctl.ScmNamespace) (*engine.Config, error) { +func (cs *ControlService) getEngineCfgFromScmNsp(nsp *ctlpb.ScmNamespace) (*engine.Config, error) { mountPoint := nsp.GetMount().Path - for index := range c.srvCfg.Engines { - for _, tierCfg := range c.srvCfg.Engines[index].Storage.Tiers { + for index := range cs.srvCfg.Engines { + for _, tierCfg := range cs.srvCfg.Engines[index].Storage.Tiers { if tierCfg.IsSCM() && tierCfg.Scm.MountPoint == mountPoint { - return c.srvCfg.Engines[index], nil + return cs.srvCfg.Engines[index], nil } } } @@ -311,10 +395,10 @@ func (c *ControlService) getEngineCfgFromScmNsp(nsp *ctl.ScmNamespace) (*engine. 
} // return the size of the RDB file used for managing SCM metadata -func (c *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { +func (cs *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { mdCapStr, err := engineCfg.GetEnvVar(daos.DaosMdCapEnv) if err != nil { - c.log.Debugf("using default RDB file size with engine %d: %s (%d Bytes)", + cs.log.Debugf("using default RDB file size with engine %d: %s (%d Bytes)", engineCfg.Index, humanize.Bytes(daos.DefaultDaosMdCapSize), daos.DefaultDaosMdCapSize) return uint64(daos.DefaultDaosMdCapSize), nil @@ -326,7 +410,7 @@ func (c *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { mdCapStr) } rdbSize = rdbSize << 20 - c.log.Debugf("using custom RDB size with engine %d: %s (%d Bytes)", + cs.log.Debugf("using custom RDB size with engine %d: %s (%d Bytes)", engineCfg.Index, humanize.Bytes(rdbSize), rdbSize) return rdbSize, nil @@ -370,7 +454,7 @@ func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace) } type deviceToAdjust struct { - ctlr *ctl.NvmeController + ctlr *ctlpb.NvmeController idx int rank uint32 } @@ -381,7 +465,7 @@ type deviceSizeStat struct { } // Add a device to the input map of device to which the usable size have to be adjusted -func (c *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64) { +func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] if devsStat[devToAdjust.rank] == nil { devsStat[devToAdjust.rank] = &deviceSizeStat{ @@ -391,10 +475,10 @@ func (c *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devsStat[devToAdjust.rank].devs = append(devsStat[devToAdjust.rank].devs, devToAdjust) targetCount := uint64(len(dev.GetTgtIds())) clusterPerTarget := dataClusterCount / targetCount - c.log.Tracef("SMD device %s (rank %d, ctlr %s) added to the list of device to adjust", + cs.log.Tracef("SMD device %s (rank %d, ctlr %s) added to the list of device to adjust", dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) if clusterPerTarget < devsStat[devToAdjust.rank].clusterPerTarget { - c.log.Tracef("Updating number of clusters per target of rank %d: old=%d new=%d", + cs.log.Tracef("Updating number of clusters per target of rank %d: old=%d new=%d", devToAdjust.rank, devsStat[devToAdjust.rank].clusterPerTarget, clusterPerTarget) devsStat[devToAdjust.rank].clusterPerTarget = clusterPerTarget } @@ -409,7 +493,7 @@ func getClusterCount(sizeBytes uint64, targetNb uint64, clusterSize uint64) uint return clusterCount * targetNb } -func (c *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdjust deviceToAdjust) (subtrClusterCount uint64) { +func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdjust deviceToAdjust) (subtrClusterCount uint64) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] clusterSize := uint64(dev.GetClusterSize()) engineTargetNb := uint64(engineCfg.TargetCount) @@ -418,14 +502,14 @@ func (c *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdju // TODO DAOS-14223: GetMetaSize() should reflect custom values set through pool // create --meta-size option. 
 		clusterCount := getClusterCount(dev.GetMetaSize(), engineTargetNb, clusterSize)
-		c.log.Tracef("Removing %d Metadata clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
+		cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
 			clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
 		subtrClusterCount += clusterCount
 	}
 
 	if dev.GetRoleBits()&storage.BdevRoleWAL != 0 {
 		clusterCount := getClusterCount(dev.GetMetaWalSize(), engineTargetNb, clusterSize)
-		c.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
+		cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
 			clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
 		subtrClusterCount += clusterCount
 	}
@@ -436,14 +520,14 @@ func (c *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdju
 	if dev.GetRoleBits()&storage.BdevRoleMeta != 0 {
 		clusterCount := getClusterCount(dev.GetRdbSize(), 1, clusterSize)
-		c.log.Tracef("Removing %d RDB clusters (cluster size: %d) the usable size of the SMD device %s (rank %d, ctlr %s)",
+		cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)",
 			clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
 		subtrClusterCount += clusterCount
 	}
 
 	if dev.GetRoleBits()&storage.BdevRoleWAL != 0 {
 		clusterCount := getClusterCount(dev.GetRdbWalSize(), 1, clusterSize)
-		c.log.Tracef("Removing %d RDB WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)",
+		cs.log.Tracef("Removing %d RDB WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)",
 			clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
 		subtrClusterCount += clusterCount
 	}
@@ -452,12 +536,12 @@
 }
 
 // Adjust the NVME available size to its real usable size.
-func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { +func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { devsStat := make(map[uint32]*deviceSizeStat, 0) for _, ctlr := range resp.GetCtrlrs() { - engineCfg, err := c.getEngineCfgFromNvmeCtl(ctlr) + engineCfg, err := cs.getEngineCfgFromNvmeCtl(ctlr) if err != nil { - c.log.Noticef("Skipping NVME controller %s: %s", ctlr.GetPciAddr(), err.Error()) + cs.log.Noticef("Skipping NVME controller %s: %s", ctlr.GetPciAddr(), err.Error()) continue } @@ -465,7 +549,7 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { rank := dev.GetRank() if dev.GetRoleBits() != 0 && (dev.GetRoleBits()&storage.BdevRoleData) == 0 { - c.log.Debugf("SMD device %s (rank %d, ctlr %s) not used to store data (Role bits 0x%X)", + cs.log.Debugf("SMD device %s (rank %d, ctlr %s) not used to store data (Role bits 0x%X)", dev.GetUuid(), rank, ctlr.GetPciAddr(), dev.GetRoleBits()) dev.TotalBytes = 0 dev.AvailBytes = 0 @@ -474,7 +558,7 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { } if ctlr.GetDevState() != ctlpb.NvmeDevState_NORMAL { - c.log.Debugf("SMD device %s (rank %d, ctlr %s) not usable: device state %q", + cs.log.Debugf("SMD device %s (rank %d, ctlr %s) not usable: device state %q", dev.GetUuid(), rank, ctlr.GetPciAddr(), ctlpb.NvmeDevState_name[int32(ctlr.DevState)]) dev.AvailBytes = 0 dev.UsableBytes = 0 @@ -482,20 +566,20 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { } if dev.GetClusterSize() == 0 || len(dev.GetTgtIds()) == 0 { - c.log.Noticef("SMD device %s (rank %d, ctlr %s) not usable: missing storage info", + cs.log.Noticef("SMD device %s (rank %d, ctlr %s) not usable: missing storage info", dev.GetUuid(), rank, ctlr.GetPciAddr()) dev.AvailBytes = 0 dev.UsableBytes = 0 continue } - c.log.Tracef("Initial available size of SMD device %s (rank %d, ctlr %s): %s (%d bytes)", + cs.log.Tracef("Initial available size of SMD device %s (rank %d, ctlr %s): %s (%d bytes)", dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.Bytes(dev.GetAvailBytes()), dev.GetAvailBytes()) clusterSize := uint64(dev.GetClusterSize()) availBytes := (dev.GetAvailBytes() / clusterSize) * clusterSize if dev.GetAvailBytes() != availBytes { - c.log.Tracef("Adjusting available size of SMD device %s (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", + cs.log.Tracef("Adjusting available size of SMD device %s (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.Bytes(dev.GetAvailBytes()), dev.GetAvailBytes(), humanize.Bytes(availBytes), availBytes) @@ -509,21 +593,21 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { } dataClusterCount := dev.GetAvailBytes() / clusterSize if dev.GetRoleBits() == 0 { - c.log.Tracef("No meta-data stored on SMD device %s (rank %d, ctlr %s)", + cs.log.Tracef("No meta-data stored on SMD device %s (rank %d, ctlr %s)", dev.GetUuid(), rank, ctlr.GetPciAddr()) - c.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) continue } - subtrClusterCount := c.getMetaClusterCount(engineCfg, devToAdjust) + subtrClusterCount := cs.getMetaClusterCount(engineCfg, devToAdjust) if subtrClusterCount >= dataClusterCount { - c.log.Debugf("No more usable space in SMD device %s (rank %d, ctlr %s)", + cs.log.Debugf("No more usable space in SMD device %s (rank %d, ctlr %s)", dev.GetUuid(), rank, ctlr.GetPciAddr()) dev.UsableBytes = 0 continue } 
dataClusterCount -= subtrClusterCount - c.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) } } @@ -532,7 +616,7 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { smdDev := dev.ctlr.GetSmdDevices()[dev.idx] targetCount := uint64(len(smdDev.GetTgtIds())) smdDev.UsableBytes = targetCount * item.clusterPerTarget * smdDev.GetClusterSize() - c.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) to %s (%d bytes)", + cs.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) to %s (%d bytes)", smdDev.GetUuid(), rank, dev.ctlr.GetPciAddr(), humanize.Bytes(smdDev.GetUsableBytes()), smdDev.GetUsableBytes()) } @@ -540,45 +624,45 @@ func (c *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { } // Adjust the SCM available size to the real usable size. -func (c *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { +func (cs *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { for _, scmNamespace := range resp.GetNamespaces() { mnt := scmNamespace.GetMount() mountPath := mnt.GetPath() mnt.UsableBytes = mnt.GetAvailBytes() - c.log.Debugf("Initial usable size of SCM %s: %s (%d bytes)", mountPath, + cs.log.Debugf("Initial usable size of SCM %s: %s (%d bytes)", mountPath, humanize.Bytes(mnt.GetUsableBytes()), mnt.GetUsableBytes()) - engineCfg, err := c.getEngineCfgFromScmNsp(scmNamespace) + engineCfg, err := cs.getEngineCfgFromScmNsp(scmNamespace) if err != nil { - c.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", + cs.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", mountPath, err.Error()) mnt.UsableBytes = 0 continue } - mdBytes, err := c.getRdbSize(engineCfg) + mdBytes, err := cs.getRdbSize(engineCfg) if err != nil { - c.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", + cs.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", mountPath, err.Error()) mnt.UsableBytes = 0 continue } - c.log.Tracef("Removing RDB (%s, %d bytes) from the usable size of the SCM device %q", + cs.log.Tracef("Removing RDB (%s, %d bytes) from the usable size of the SCM device %q", humanize.Bytes(mdBytes), mdBytes, mountPath) if mdBytes >= mnt.GetUsableBytes() { - c.log.Debugf("No more usable space in SCM device %s", mountPath) + cs.log.Debugf("No more usable space in SCM device %s", mountPath) mnt.UsableBytes = 0 continue } mnt.UsableBytes -= mdBytes - removeControlPlaneMetadata := func(m *ctl.ScmNamespace_Mount) { + removeControlPlaneMetadata := func(m *ctlpb.ScmNamespace_Mount) { mountPath := m.GetPath() - c.log.Tracef("Removing control plane metadata (%s, %d bytes) from the usable size of the SCM device %q", + cs.log.Tracef("Removing control plane metadata (%s, %d bytes) from the usable size of the SCM device %q", humanize.Bytes(mdDaosScmBytes), mdDaosScmBytes, mountPath) if mdDaosScmBytes >= m.GetUsableBytes() { - c.log.Debugf("No more usable space in SCM device %s", mountPath) + cs.log.Debugf("No more usable space in SCM device %s", mountPath) m.UsableBytes = 0 return } @@ -588,7 +672,7 @@ func (c *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { removeControlPlaneMetadata(mnt) } else { if !engineCfg.Storage.ControlMetadata.HasPath() { - c.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", + cs.log.Noticef("Adjusting usable size to 0 Bytes of SCM device %q: %s", mountPath, "MD on SSD feature enabled without path for Control Metadata") mnt.UsableBytes = 0 @@ -598,7 +682,7 @@ func 
(c *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { cmdPath := engineCfg.Storage.ControlMetadata.Path if hasPrefix, err := common.HasPrefixPath(mountPath, cmdPath); hasPrefix || err != nil { if err != nil { - c.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) + cs.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) } if hasPrefix { removeControlPlaneMetadata(mnt) @@ -606,61 +690,46 @@ func (c *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { } } - c.log.Tracef("Removing (%s, %d bytes) of usable size from the SCM device %q: space used by the file system metadata", + cs.log.Tracef("Removing (%s, %d bytes) of usable size from the SCM device %q: space used by the file system metadata", humanize.Bytes(mdFsScmBytes), mdFsScmBytes, mountPath) mnt.UsableBytes -= mdFsScmBytes usableBytes := scmNamespace.Mount.GetUsableBytes() - c.log.Debugf("Usable size of SCM device %q: %s (%d bytes)", + cs.log.Debugf("Usable size of SCM device %q: %s (%d bytes)", scmNamespace.Mount.GetPath(), humanize.Bytes(usableBytes), usableBytes) } } // StorageScan discovers non-volatile storage hardware on node. -func (c *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScanReq) (*ctlpb.StorageScanResp, error) { +func (cs *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScanReq) (*ctlpb.StorageScanResp, error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq } - resp := new(ctlpb.StorageScanResp) - - // In the case that usage stats are being requested, relevant flags for both SCM and NVMe - // will be set and so fail if engines are not ready for comms. This restriction should not - // be applied if only the Meta flag is set in the NVMe component of the request to continue - // to support off-line storage scan functionality which uses cached stats (e.g. dmg storage - // scan --nvme-meta). - // - // TODO DAOS-13228: Remove --nvme-meta scan option and the below workaround. - // If usage or meta requested, fail if no engines started and skip stopped - // engines in bdev scan. Only return results for ready engines over dRPC. 
- if req.Scm.Usage && req.Nvme.Meta { - nrInstances := len(c.harness.Instances()) - readyRanks := c.harness.readyRanks() - if len(readyRanks) != nrInstances { - return nil, errors.Wrapf(errEngineNotReady, "%s, ready: %v", - english.Plural(nrInstances, "engine", "engines"), - readyRanks) - } + if cs.srvCfg == nil { + return nil, errNoSrvCfg } + resp := new(ctlpb.StorageScanResp) - respScm, err := c.scanScm(ctx, req.Scm) + respScm, err := cs.scanScm(ctx, req.Scm) if err != nil { return nil, err } - if req.Scm.GetUsage() { - c.adjustScmSize(respScm) - } resp.Scm = respScm - respNvme, err := scanBdevs(ctx, c, req.Nvme, respScm.Namespaces) - if err != nil { - return nil, err - } - if req.Nvme.GetMeta() { - c.adjustNvmeSize(respNvme) + if cs.srvCfg.DisableHugepages { + cs.log.Notice("bdev scan skipped as use of hugepages disabled in config") + resp.Nvme = &ctlpb.ScanNvmeResp{ + State: new(ctlpb.ResponseState), + } + } else { + respNvme, err := scanBdevs(ctx, cs, req.Nvme, respScm.Namespaces) + if err != nil { + return nil, err + } + resp.Nvme = respNvme } - resp.Nvme = respNvme - mi, err := c.getMemInfo() + mi, err := cs.getMemInfo() if err != nil { return nil, err } @@ -671,9 +740,9 @@ func (c *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScan return resp, nil } -func (c *ControlService) formatMetadata(instances []Engine, reformat bool) (bool, error) { +func (cs *ControlService) formatMetadata(instances []Engine, reformat bool) (bool, error) { // Format control metadata first, if needed - if needs, err := c.storage.ControlMetadataNeedsFormat(); err != nil { + if needs, err := cs.storage.ControlMetadataNeedsFormat(); err != nil { return false, errors.Wrap(err, "detecting if metadata format is needed") } else if needs || reformat { engineIdxs := make([]uint, len(instances)) @@ -681,15 +750,15 @@ func (c *ControlService) formatMetadata(instances []Engine, reformat bool) (bool engineIdxs[i] = uint(eng.Index()) } - c.log.Debug("formatting control metadata storage") - if err := c.storage.FormatControlMetadata(engineIdxs); err != nil { + cs.log.Debug("formatting control metadata storage") + if err := cs.storage.FormatControlMetadata(engineIdxs); err != nil { return false, errors.Wrap(err, "formatting control metadata storage") } return true, nil } - c.log.Debug("no control metadata format needed") + cs.log.Debug("no control metadata format needed") return false, nil } @@ -821,15 +890,22 @@ type formatNvmeReq struct { mdFormatted bool } -func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageFormatResp) { +func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageFormatResp) error { // Allow format to complete on one instance even if another fails for idx, engine := range req.instances { _, hasError := req.errored[idx] _, skipped := req.skipped[idx] - if hasError || (skipped && !req.mdFormatted) { - // If scm failed to format or was already formatted, skip bdev format. + + // Skip NVMe format if scm was already formatted or failed to format. 
+		skipReason := ""
+		if hasError {
+			skipReason = msgNvmeFormatSkipFail
+		} else if skipped && !req.mdFormatted {
+			skipReason = msgNvmeFormatSkipNotDone
+		}
+		if skipReason != "" {
 			ret := engine.newCret(storage.NilBdevAddress, nil)
-			ret.State.Info = fmt.Sprintf(msgNvmeFormatSkip, engine.Index())
+			ret.State.Info = fmt.Sprintf(skipReason, engine.Index())
 			resp.Crets = append(resp.Crets, ret)
 			continue
 		}
@@ -849,13 +925,16 @@ func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageForma
 		pbCtrlrs := proto.NvmeControllers(respBdevs.Ctrlrs)
 		ctrlrs, err := pbCtrlrs.ToNative()
 		if err != nil {
-			req.errored[idx] = err.Error()
-			resp.Crets = append(resp.Crets, engine.newCret("", err))
-			continue
+			return errors.Wrapf(err, "convert %T to %T", pbCtrlrs, ctrlrs)
+		}
+
+		ei, ok := engine.(*EngineInstance)
+		if !ok {
+			return errors.New("Engine interface obj is not an EngineInstance")
 		}
 
 		// SCM formatted correctly on this instance, format NVMe
-		cResults := formatEngineBdevs(engine.(*EngineInstance), ctrlrs)
+		cResults := formatEngineBdevs(ei, ctrlrs)
 
 		if cResults.HasErrors() {
 			req.errored[idx] = cResults.Errors()
@@ -870,6 +949,8 @@ func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageForma
 
 		resp.Crets = append(resp.Crets, cResults...)
 	}
+
+	return nil
 }
 
 // StorageFormat delegates to Storage implementation's Format methods to prepare
@@ -880,8 +961,15 @@ func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageForma
 //
 // Send response containing multiple results of format operations on scm mounts
 // and nvme controllers.
-func (c *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageFormatReq) (*ctlpb.StorageFormatResp, error) {
-	instances := c.harness.Instances()
+func (cs *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageFormatReq) (*ctlpb.StorageFormatResp, error) {
+	if req == nil {
+		return nil, errNilReq
+	}
+	if cs.srvCfg == nil {
+		return nil, errNoSrvCfg
+	}
+
+	instances := cs.harness.Instances()
 	resp := new(ctlpb.StorageFormatResp)
 	resp.Mrets = make([]*ctlpb.ScmMountResult, 0, len(instances))
 	resp.Crets = make([]*ctlpb.NvmeControllerResult, 0, len(instances))
@@ -891,50 +979,72 @@ func (c *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageFo
 		return resp, nil
 	}
 
-	mdFormatted, err := c.formatMetadata(instances, req.Reformat)
+	mdFormatted, err := cs.formatMetadata(instances, req.Reformat)
 	if err != nil {
 		return nil, err
 	}
 
 	fsr := formatScmReq{
-		log:        c.log,
+		log:        cs.log,
 		reformat:   req.Reformat,
 		instances:  instances,
-		getMemInfo: c.getMemInfo,
+		getMemInfo: cs.getMemInfo,
 	}
+	cs.log.Tracef("formatScmReq: %+v", fsr)
 	instanceErrors, instanceSkips, err := formatScm(ctx, fsr, resp)
 	if err != nil {
 		return nil, err
 	}
 
-	fnr := formatNvmeReq{
-		log:         c.log,
-		instances:   instances,
-		errored:     instanceErrors,
-		skipped:     instanceSkips,
-		mdFormatted: mdFormatted,
+	hugepagesDisabled := false
+	if cs.srvCfg.DisableHugepages {
+		cs.log.Debug("skipping bdev format as use of hugepages disabled in config")
+		hugepagesDisabled = true
+	} else {
+		fnr := formatNvmeReq{
+			log:         cs.log,
+			instances:   instances,
+			errored:     instanceErrors,
+			skipped:     instanceSkips,
+			mdFormatted: mdFormatted,
+		}
+		cs.log.Tracef("formatNvmeReq: %+v", fnr)
+		if err := formatNvme(ctx, fnr, resp); err != nil {
+			return nil, err
+		}
 	}
-	formatNvme(ctx, fnr, resp)
+
+	cs.log.Tracef("StorageFormatResp: %+v", resp)
 
 	// Notify storage ready for instances formatted without error.
// Block until all instances have formatted NVMe to avoid // VFIO device or resource busy when starting I/O Engines // because devices have already been claimed during format. - for idx, ei := range instances { + for idx, engine := range instances { + if hugepagesDisabled { + // Populate skip NVMe format results for all engines. + ret := engine.newCret(storage.NilBdevAddress, nil) + ret.State.Info = fmt.Sprintf(msgNvmeFormatSkipHPD, engine.Index()) + resp.Crets = append(resp.Crets, ret) + } if msg, hasError := instanceErrors[idx]; hasError { - c.log.Errorf("instance %d: %s", idx, msg) + cs.log.Errorf("instance %d: %s", idx, msg) continue } - ei.NotifyStorageReady() + engine.NotifyStorageReady() } return resp, nil } // StorageNvmeRebind rebinds SSD from kernel and binds to user-space to allow DAOS to use it. -func (c *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.NvmeRebindReq) (*ctlpb.NvmeRebindResp, error) { +func (cs *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.NvmeRebindReq) (*ctlpb.NvmeRebindResp, error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg + } + if cs.srvCfg.DisableHugepages { + return nil, FaultHugepagesDisabled } cu, err := user.Current() @@ -951,9 +1061,9 @@ func (c *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.NvmeR } resp := new(ctlpb.NvmeRebindResp) - if _, err := c.NvmePrepare(prepReq); err != nil { + if _, err := cs.NvmePrepare(prepReq); err != nil { err = errors.Wrap(err, "nvme rebind") - c.log.Error(err.Error()) + cs.log.Error(err.Error()) resp.State = &ctlpb.ResponseState{ Error: err.Error(), @@ -969,12 +1079,18 @@ func (c *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.NvmeR // StorageNvmeAddDevice adds a newly added SSD to a DAOS engine's NVMe config to allow it to be used. // // If StorageTierIndex is set to -1 in request, add the device to the first configured bdev tier. -func (c *ControlService) StorageNvmeAddDevice(ctx context.Context, req *ctlpb.NvmeAddDeviceReq) (resp *ctlpb.NvmeAddDeviceResp, err error) { +func (cs *ControlService) StorageNvmeAddDevice(ctx context.Context, req *ctlpb.NvmeAddDeviceReq) (resp *ctlpb.NvmeAddDeviceResp, err error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg + } + if cs.srvCfg.DisableHugepages { + return nil, FaultHugepagesDisabled } - engines := c.harness.Instances() + engines := cs.harness.Instances() engineIndex := req.GetEngineIndex() if len(engines) <= int(engineIndex) { @@ -1003,17 +1119,17 @@ func (c *ControlService) StorageNvmeAddDevice(ctx context.Context, req *ctlpb.Nv tierIndex) } - c.log.Debugf("bdev list to be updated: %+v", tierCfg.Bdev.DeviceList) + cs.log.Debugf("bdev list to be updated: %+v", tierCfg.Bdev.DeviceList) if err := tierCfg.Bdev.DeviceList.AddStrings(req.PciAddr); err != nil { return nil, errors.Errorf("updating bdev list for tier %d", tierIndex) } - c.log.Debugf("updated bdev list: %+v", tierCfg.Bdev.DeviceList) + cs.log.Debugf("updated bdev list: %+v", tierCfg.Bdev.DeviceList) // TODO: Supply scan results for VMD backing device address mapping. 
resp = new(ctlpb.NvmeAddDeviceResp) - if err := engineStorage.WriteNvmeConfig(ctx, c.log, nil); err != nil { + if err := engineStorage.WriteNvmeConfig(ctx, cs.log, nil); err != nil { err = errors.Wrapf(err, "write nvme config for engine %d", engineIndex) - c.log.Error(err.Error()) + cs.log.Error(err.Error()) // report write conf call result in response resp.State = &ctlpb.ResponseState{ diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index 3fb19e803003..4df8b2b0e653 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,6 +28,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/provider/system" "github.com/daos-stack/daos/src/control/server/config" @@ -59,6 +60,7 @@ var ( func TestServer_bdevScan(t *testing.T) { for name, tc := range map[string]struct { req *ctlpb.ScanNvmeReq + disableHPs bool provRes *storage.BdevScanResponse provErr error engTierCfgs []storage.TierConfigs // one per-engine @@ -70,17 +72,22 @@ func TestServer_bdevScan(t *testing.T) { expBackendScanCalls []storage.BdevScanRequest }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, }, - "no bdevs in config; scan local fails": { - req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + "hugepages disabled": { + req: &ctlpb.ScanNvmeReq{}, + disableHPs: true, + expErr: errors.New("hugepages have been disabled"), + }, + "scan local; no bdevs in config; scan fails": { + req: &ctlpb.ScanNvmeReq{Health: true}, engTierCfgs: []storage.TierConfigs{{}}, provErr: errors.New("fail"), engStopped: []bool{false}, expErr: errors.New("fail"), }, - "no bdevs in config; scan local; devlist passed to backend": { - req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + "scan local; no bdevs in config; devlist passed to backend": { + req: &ctlpb.ScanNvmeReq{Health: true}, engTierCfgs: []storage.TierConfigs{{}}, engStopped: []bool{false}, expResp: &ctlpb.ScanNvmeResp{ @@ -93,7 +100,20 @@ func TestServer_bdevScan(t *testing.T) { {DeviceList: new(storage.BdevDeviceList)}, }, }, - "bdevs in config; engine not started; scan local; devlist passed to backend": { + // This should succeed so nil NVMe stats can be returned in SCM-only scenarios. 
+ "scan local; no bdevs in config; meta requested": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{{}}, + engStopped: []bool{false}, + provRes: &storage.BdevScanResponse{}, + expResp: &ctlpb.ScanNvmeResp{ + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + {DeviceList: new(storage.BdevDeviceList)}, + }, + }, + "scan local; bdevs in config; meta requested": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { @@ -103,6 +123,19 @@ func TestServer_bdevScan(t *testing.T) { test.MockPCIAddr(2)), }, }, + engStopped: []bool{true}, + expErr: errors.New("info unavailable"), + }, + "scan local; bdevs in config; devlist passed to backend; no roles": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, provRes: &storage.BdevScanResponse{ Controllers: storage.NvmeControllers{ storage.MockNvmeController(1), @@ -112,8 +145,26 @@ func TestServer_bdevScan(t *testing.T) { engStopped: []bool{true}, expResp: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ - proto.MockNvmeController(1), - proto.MockNvmeController(2), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(1) + c.SmdDevices = []*ctlpb.SmdDevice{ + { + Rank: uint32(ranklist.NilRank), + RoleBits: 0, // No roles. + }, + } + return c + }(), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.SmdDevices = []*ctlpb.SmdDevice{ + { + Rank: uint32(ranklist.NilRank), + RoleBits: 0, // No roles. + }, + } + return c + }(), }, State: new(ctlpb.ResponseState), }, @@ -124,8 +175,61 @@ func TestServer_bdevScan(t *testing.T) { }, }, }, - "bdevs in config; engine not started; scan local; retry on empty response": { - req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + "scan local; bdevs in config; devlist passed to backend; roles from cfg": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleWAL), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(2)). 
+ WithBdevDeviceRoles(storage.BdevRoleMeta | storage.BdevRoleData), + }, + }, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + storage.MockNvmeController(1), + storage.MockNvmeController(2), + }, + }, + engStopped: []bool{true}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(1) + c.SmdDevices = []*ctlpb.SmdDevice{ + { + Rank: uint32(ranklist.NilRank), + RoleBits: uint32(storage.BdevRoleWAL), + }, + } + return c + }(), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.SmdDevices = []*ctlpb.SmdDevice{ + { + Rank: uint32(ranklist.NilRank), + RoleBits: uint32(storage.BdevRoleMeta | storage.BdevRoleData), + }, + } + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + expBackendScanCalls: []storage.BdevScanRequest{ + { + DeviceList: storage.MustNewBdevDeviceList( + test.MockPCIAddr(1), test.MockPCIAddr(2)), + }, + }, + }, + "scan local; bdevs in config; devlist passed to backend; retry on empty response": { + req: &ctlpb.ScanNvmeReq{Health: true}, engTierCfgs: []storage.TierConfigs{ { storage.NewTierConfig(). @@ -153,7 +257,7 @@ func TestServer_bdevScan(t *testing.T) { }, }, }, - "bdevs in config; engine started; scan remote": { + "scan remote; bdevs in config": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { @@ -364,8 +468,8 @@ func TestServer_bdevScan(t *testing.T) { }, }, }, - "bdevs in config; engine not started; scan local; vmd enabled": { - req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + "scan local; bdevs in config; vmd enabled": { + req: &ctlpb.ScanNvmeReq{}, engTierCfgs: []storage.TierConfigs{ { storage.NewTierConfig(). @@ -382,18 +486,35 @@ func TestServer_bdevScan(t *testing.T) { engStopped: []bool{true}, expResp: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ - &ctlpb.NvmeController{PciAddr: "050505:01:00.0"}, - &ctlpb.NvmeController{PciAddr: "050505:03:00.0"}, + func() *ctlpb.NvmeController { + nc := &ctlpb.NvmeController{ + PciAddr: "050505:01:00.0", + DevState: ctlpb.NvmeDevState_NORMAL, + } + nc.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return nc + }(), + func() *ctlpb.NvmeController { + nc := &ctlpb.NvmeController{ + PciAddr: "050505:03:00.0", + DevState: ctlpb.NvmeDevState_NORMAL, + } + nc.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return nc + }(), }, State: new(ctlpb.ResponseState), }, expBackendScanCalls: []storage.BdevScanRequest{ {DeviceList: storage.MustNewBdevDeviceList("0000:05:05.5")}, - {DeviceList: storage.MustNewBdevDeviceList("0000:05:05.5")}, }, }, - "bdevs in config; engine started; scan remote; vmd enabled": { - req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + "scan remote; bdevs in config; vmd enabled": { + req: &ctlpb.ScanNvmeReq{Meta: true}, engTierCfgs: []storage.TierConfigs{ { storage.NewTierConfig(). @@ -457,7 +578,8 @@ func TestServer_bdevScan(t *testing.T) { engCfg := engine.MockConfig().WithStorage(tcs...) engCfgs = append(engCfgs, engCfg) } - sCfg := config.DefaultServer().WithEngines(engCfgs...) + sCfg := config.DefaultServer().WithEngines(engCfgs...). 
+ WithDisableHugepages(tc.disableHPs) bmbc := &bdev.MockBackendConfig{ ScanRes: tc.provRes, @@ -528,9 +650,20 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { smbc *scm.MockBackendConfig tierCfgs storage.TierConfigs enginesNotReady bool + disableHPs bool + noSrvCfg bool + nilReq bool expResp *ctlpb.StorageScanResp expErr error }{ + "nil request": { + nilReq: true, + expErr: errNilReq, + }, + "missing server config": { + noSrvCfg: true, + expErr: errNoSrvCfg, + }, "successful scan; scm namespaces": { bdevScanRes: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ @@ -544,6 +677,10 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace()}, }, tierCfgs: storage.TierConfigs{ + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmMountPoint("/mnt/daos0"). + WithScmDeviceList("/dev/pmem0"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(ctrlr.PciAddr, test.MockPCIAddr(2)), @@ -644,6 +781,29 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { MemInfo: proto.MockPBMemInfo(), }, }, + "hugepages disabled": { + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, + }, + State: new(ctlpb.ResponseState), + }, + smbc: &scm.MockBackendConfig{ + GetModulesRes: storage.ScmModules{storage.MockScmModule()}, + GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace()}, + }, + disableHPs: true, + expResp: &ctlpb.StorageScanResp{ + Nvme: &ctlpb.ScanNvmeResp{ + State: &ctlpb.ResponseState{}, + }, + Scm: &ctlpb.ScanScmResp{ + Namespaces: proto.ScmNamespaces{proto.MockScmNamespace()}, + State: new(ctlpb.ResponseState), + }, + MemInfo: proto.MockPBMemInfo(), + }, + }, "scm module discovery failure": { bdevScanRes: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ @@ -719,7 +879,7 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { MemInfo: proto.MockPBMemInfo(), }, }, - "scan usage": { + "scan usage; engines not ready": { req: &ctlpb.StorageScanReq{ Scm: &ctlpb.ScanScmReq{ Usage: true, @@ -729,7 +889,7 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { }, }, enginesNotReady: true, - expErr: errEngineNotReady, + expErr: errors.New("no scm details found"), }, } { t.Run(name, func(t *testing.T) { @@ -738,7 +898,8 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { engineCfg := engine.MockConfig().WithStorage(tc.tierCfgs...) engineCfgs := []*engine.Config{engineCfg} - sCfg := config.DefaultServer().WithEngines(engineCfgs...) + sCfg := config.DefaultServer().WithEngines(engineCfgs...). 
+ WithDisableHugepages(tc.disableHPs) var cs *ControlService if tc.enginesNotReady { @@ -754,12 +915,15 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { scanBdevs = bdevScan }() - if tc.req == nil { + if tc.req == nil && !tc.nilReq { tc.req = &ctlpb.StorageScanReq{ Scm: new(ctlpb.ScanScmReq), Nvme: new(ctlpb.ScanNvmeReq), } } + if tc.noSrvCfg { + cs.srvCfg = nil + } resp, err := cs.StorageScan(test.Context(t), tc.req) test.CmpErr(t, tc.expErr, err) @@ -897,11 +1061,31 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { bmbcs []*bdev.MockBackendConfig awaitTimeout time.Duration getMemInfo func() (*common.MemInfo, error) + disableHPs bool + nilReq bool + noSrvCfg bool expAwaitExit bool expAwaitErr error expResp *ctlpb.StorageFormatResp + expErr error reformat bool // indicates setting of reformat parameter }{ + "nil request": { + nilReq: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{}, + Mrets: []*ctlpb.ScmMountResult{}, + }, + expErr: errNilReq, + }, + "missing server config": { + noSrvCfg: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{}, + Mrets: []*ctlpb.ScmMountResult{}, + }, + expErr: errNoSrvCfg, + }, "ram no nvme": { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassRam, @@ -932,6 +1116,48 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { }, }, }, + "nvme and ram; use of hugepages disabled": { + sMounts: []string{"/mnt/daos"}, + sClass: storage.ClassRam, + sDevs: []string{"/dev/pmem1"}, // ignored if SCM class is ram + sSize: 6, + bClass: storage.ClassNvme, + bDevs: [][]string{{mockNvmeController0.PciAddr}}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + mockNvmeController0, + }, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, + }, + }, + }, + }, + disableHPs: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{ + { + PciAddr: storage.NilBdevAddress, + State: &ctlpb.ResponseState{ + Status: ctlpb.ResponseStatus_CTL_SUCCESS, + Info: fmt.Sprintf(msgNvmeFormatSkipHPD, 0), + }, + }, + }, + Mrets: []*ctlpb.ScmMountResult{ + { + Mntpoint: "/mnt/daos", + State: new(ctlpb.ResponseState), + }, + }, + }, + }, "nvme and ram": { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassRam, @@ -1062,7 +1288,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1100,7 +1327,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1212,7 +1440,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1384,7 +1613,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { } } - config := config.DefaultServer() + config := config.DefaultServer().WithDisableHugepages(tc.disableHPs) // validate test parameters if len(tc.sDevs) > 0 { @@ -1559,7 
+1788,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { t.Log("rx on awaitCh from unusual awaitStorageReady() returns") test.CmpErr(t, tc.expAwaitErr, err) if !tc.expAwaitExit { - t.Fatal("unexpected exit from awaitStorageReady()") + t.Fatalf("unexpected exit from awaitStorageReady()") } case <-ctx.Done(): t.Logf("context done (%s)", ctx.Err()) @@ -1572,11 +1801,20 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { } } - resp, fmtErr := cs.StorageFormat(test.Context(t), &ctlpb.StorageFormatReq{ - Reformat: tc.reformat, - }) + var req *ctlpb.StorageFormatReq + if !tc.nilReq { + req = &ctlpb.StorageFormatReq{ + Reformat: tc.reformat, + } + } + if tc.noSrvCfg { + cs.srvCfg = nil + } + + resp, fmtErr := cs.StorageFormat(test.Context(t), req) + test.CmpErr(t, tc.expErr, fmtErr) if fmtErr != nil { - t.Fatal(fmtErr) + return } test.AssertEqual(t, len(tc.expResp.Crets), len(resp.Crets), @@ -1618,12 +1856,21 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { for name, tc := range map[string]struct { req *ctlpb.NvmeRebindReq bmbc *bdev.MockBackendConfig + disableHPs bool + noSrvCfg bool expErr error expResp *ctlpb.NvmeRebindResp expPrepCall *storage.BdevPrepareRequest }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, + }, + "missing server config": { + req: &ctlpb.NvmeRebindReq{ + PciAddr: test.MockPCIAddr(1), + }, + noSrvCfg: true, + expErr: errNoSrvCfg, }, "failure": { req: &ctlpb.NvmeRebindReq{ @@ -1643,6 +1890,16 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { }, }, }, + "hugepages disabled": { + req: &ctlpb.NvmeRebindReq{ + PciAddr: test.MockPCIAddr(1), + }, + disableHPs: true, + bmbc: &bdev.MockBackendConfig{ + PrepareErr: errors.New("failure"), + }, + expErr: FaultHugepagesDisabled, + }, "success": { req: &ctlpb.NvmeRebindReq{ PciAddr: test.MockPCIAddr(1), @@ -1665,6 +1922,11 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { scm.NewMockProvider(log, nil, nil), mbp, nil) cs := &ControlService{StorageControlService: *scs} + if !tc.noSrvCfg { + cs.srvCfg = config.DefaultServer(). + WithDisableHugepages(tc.disableHPs) + } + resp, err := cs.StorageNvmeRebind(test.Context(t), tc.req) mbb.RLock() @@ -1699,12 +1961,21 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { req *ctlpb.NvmeAddDeviceReq bmbc *bdev.MockBackendConfig storageCfgs []storage.TierConfigs + disableHPs bool + noSrvCfg bool expErr error expDevList []string expResp *ctlpb.NvmeAddDeviceResp }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, + }, + "missing server config": { + req: &ctlpb.NvmeAddDeviceReq{ + PciAddr: test.MockPCIAddr(1), + }, + noSrvCfg: true, + expErr: errNoSrvCfg, }, "missing engine index 0": { req: &ctlpb.NvmeAddDeviceReq{ @@ -1741,6 +2012,20 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { }, expErr: errors.New("no bdev storage tiers"), }, + "hugepages disabled": { + req: &ctlpb.NvmeAddDeviceReq{ + PciAddr: test.MockPCIAddr(1), + StorageTierIndex: -1, + }, + disableHPs: true, + storageCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()), + }, + }, + expErr: FaultHugepagesDisabled, + }, "missing bdev config index 0": { req: &ctlpb.NvmeAddDeviceReq{ PciAddr: test.MockPCIAddr(1), @@ -1980,9 +2265,12 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { ec.Index = uint32(idx) engineCfgs = append(engineCfgs, ec) } - serverCfg := config.DefaultServer().WithEngines(engineCfgs...) 
- + serverCfg := config.DefaultServer().WithEngines(engineCfgs...). + WithDisableHugepages(tc.disableHPs) cs := mockControlService(t, log, serverCfg, tc.bmbc, nil, nil) + if tc.noSrvCfg { + cs.srvCfg = nil + } resp, err := cs.StorageNvmeAddDevice(test.Context(t), tc.req) test.CmpErr(t, tc.expErr, err) diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index e73e0a08cdb0..d70c02073112 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -543,6 +543,14 @@ func (c *Config) WithStorageSpdkRpcSrvProps(enable bool, sockAddr string) *Confi return c } +// WithStorageAutoFaultyCriteria specifies NVMe auto-faulty settings in the I/O Engine. +func (c *Config) WithStorageAutoFaultyCriteria(enable bool, maxIoErrs, maxCsumErrs uint32) *Config { + c.Storage.AutoFaultyProps.Enable = enable + c.Storage.AutoFaultyProps.MaxIoErrs = maxIoErrs + c.Storage.AutoFaultyProps.MaxCsumErrs = maxCsumErrs + return c +} + // WithIndex sets the I/O Engine instance index. func (c *Config) WithIndex(i uint32) *Config { c.Index = i diff --git a/src/control/server/faults.go b/src/control/server/faults.go index ffadcaa93c1a..daf569fd7370 100644 --- a/src/control/server/faults.go +++ b/src/control/server/faults.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -56,6 +56,11 @@ var ( "cannot destroy a pool with existing containers", "retry the operation with the recursive flag set to remove containers along with the pool", ) + FaultHugepagesDisabled = serverFault( + code.ServerHugepagesDisabled, + "the use of hugepages has been disabled in the server config", + "set false (or remove) disable_hugepages parameter in config and reformat storage, then retry the operation", + ) ) func FaultPoolInvalidServiceReps(maxSvcReps uint32) *fault.Fault { diff --git a/src/control/server/harness.go b/src/control/server/harness.go index ae90d3ed711d..af8eb206ce51 100644 --- a/src/control/server/harness.go +++ b/src/control/server/harness.go @@ -62,6 +62,7 @@ type Engine interface { OnReady(...onReadyFn) GetStorage() *storage.Provider Debugf(format string, args ...interface{}) + Tracef(format string, args ...interface{}) } // EngineHarness is responsible for managing Engine instances. diff --git a/src/control/server/instance.go b/src/control/server/instance.go index b32831f03b98..e430d32913a7 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -274,6 +274,11 @@ func (ei *EngineInstance) handleReady(ctx context.Context, ready *srvpb.NotifyRe } func (ei *EngineInstance) SetupRank(ctx context.Context, rank ranklist.Rank, map_version uint32) error { + if ei.IsReady() { + ei.log.Debugf("SetupRank called on an already set-up instance %d", ei.Index()) + return nil + } + if err := ei.callSetRank(ctx, rank, map_version); err != nil { return errors.Wrap(err, "SetRank failed") } @@ -366,3 +371,7 @@ func (ei *EngineInstance) callSetUp(ctx context.Context) error { func (ei *EngineInstance) Debugf(format string, args ...interface{}) { ei.log.Debugf(format, args...) 
} + +func (ei *EngineInstance) Tracef(format string, args ...interface{}) { + ei.log.Tracef(format, args...) +} diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index 7b2b38cc57a0..8cc363042287 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -118,7 +118,7 @@ func (ei *EngineInstance) awaitStorageReady(ctx context.Context) error { if !needsMetaFormat && !needsScmFormat { ei.log.Debugf("instance %d: no SCM format required; checking for superblock", idx) - needsSuperblock, err := ei.NeedsSuperblock() + needsSuperblock, err := ei.needsSuperblock() if err != nil { ei.log.Errorf("instance %d: failed to check instance superblock: %s", idx, err) } diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index 9ce68660f254..d6f66be81dd9 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -9,6 +9,7 @@ package server import ( "context" "fmt" + "sort" "time" "github.com/pkg/errors" @@ -18,6 +19,7 @@ import ( "github.com/daos-stack/daos/src/control/common/proto" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" "github.com/daos-stack/daos/src/control/fault" + "github.com/daos-stack/daos/src/control/lib/hardware" "github.com/daos-stack/daos/src/control/server/storage" ) @@ -81,9 +83,9 @@ func (ei *EngineInstance) scmFormat(force bool) (*ctlpb.ScmMountResult, error) { func formatEngineBdevs(ei *EngineInstance, ctrlrs storage.NvmeControllers) (results proto.NvmeControllerResults) { // If no superblock exists, format NVMe and populate response with results. 
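Reviewer sketch: the hunks above add the FaultHugepagesDisabled fault and the test cases that expect it (disableHPs) alongside the nil-request and missing-server-config sentinels (errNilReq, errNoSrvCfg). A hypothetical guard showing how a control-service handler might combine these checks; the helper name and wiring are assumptions, not code from this patch.

```go
// Hypothetical pre-flight guard, not from the patch: combines the sentinel
// checks with the new hugepages fault, as the test expectations suggest.
func checkNvmeReqPreconditions(req interface{}, srvCfg *config.Server) error {
	if req == nil {
		return errors.New("nil request") // what the errNilReq cases expect
	}
	if srvCfg == nil {
		return errors.New("no server config") // what the errNoSrvCfg cases expect
	}
	if srvCfg.DisableHugepages {
		// NVMe operations require hugepages; surface the new fault.
		return FaultHugepagesDisabled
	}
	return nil
}
```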
- needsSuperblock, err := ei.NeedsSuperblock() + needsSuperblock, err := ei.needsSuperblock() if err != nil { - ei.log.Errorf("engine storage for %s instance %d: NeedsSuperblock(): %s", + ei.log.Errorf("engine storage for %s instance %d: needsSuperblock(): %s", build.DataPlaneName, ei.Index(), err) return proto.NvmeControllerResults{ @@ -100,19 +102,22 @@ func formatEngineBdevs(ei *EngineInstance, ctrlrs storage.NvmeControllers) (resu for _, tr := range ei.storage.FormatBdevTiers(ctrlrs) { if tr.Error != nil { - results = append(results, ei.newCret(fmt.Sprintf("tier %d", tr.Tier), tr.Error)) + results = append(results, ei.newCret(fmt.Sprintf("tier %d", tr.Tier), + tr.Error)) continue } for devAddr, status := range tr.Result.DeviceResponses { - ei.log.Debugf("instance %d: tier %d: device fmt of %s, status %+v", - ei.Index(), tr.Tier, devAddr, status) + ei.log.Debugf("instance %d: tier %d: device fmt of %s, status %+v, roles %q", + ei.Index(), tr.Tier, devAddr, status, tr.DeviceRoles) // TODO DAOS-5828: passing status.Error directly triggers segfault var err error if status.Error != nil { err = status.Error } - results = append(results, ei.newCret(devAddr, err)) + res := ei.newCret(devAddr, err) + res.RoleBits = uint32(tr.DeviceRoles.OptionBits) + results = append(results, res) } } @@ -159,17 +164,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe } func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController) (bool, error) { - state := ctrlr.DevState - if state != ctlpb.NvmeDevState_NORMAL && state != ctlpb.NvmeDevState_EVICTED { + stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)] + if !ctrlr.CanSupplyHealthStats() { engine.Debugf("skip fetching health stats on device %q in %q state", - ctrlr.PciAddr, ctlpb.NvmeDevState_name[int32(state)]) + ctrlr.PciAddr, stateName) return false, nil } health, err := getCtrlrHealth(ctx, engine, req) if err != nil { return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, - state) + stateName) } ctrlr.HealthStats = health @@ -206,14 +211,31 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc c.SmdDevices = nil c.HealthStats = nil seenCtrlrs[addr] = c - pbResp.Ctrlrs = append(pbResp.Ctrlrs, c) } c := seenCtrlrs[addr] + // Only minimal info provided in standard scan to enable result aggregation across + // homogeneous hosts. + engineRank, err := engine.GetRank() + if err != nil { + return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) + } + nsd := &ctlpb.SmdDevice{ + RoleBits: sd.RoleBits, + CtrlrNamespaceId: sd.CtrlrNamespaceId, + Rank: engineRank.Uint32(), + } + + if !sd.Ctrlr.IsScannable() { + engine.Debugf("smd %q partial update of ctrlr %+v with bad state", + sd.Uuid, sd.Ctrlr) + continue + } + // Populate health if requested. healthUpdated := false - if pbReq.Health { + if pbReq.Health && c.HealthStats == nil { bhReq := &ctlpb.BioHealthReq{ DevUuid: sd.Uuid, MetaSize: pbReq.MetaSize, @@ -226,11 +248,11 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc healthUpdated = upd } - // Populate SMD (meta) if requested. + // Populate usage data if requested. 
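Reviewer sketch: populateCtrlrHealth now delegates its state check to ctrlr.CanSupplyHealthStats(), whose body is not part of this patch. Inferred from the condition it replaces, it is presumably equivalent to the following.

```go
// Assumed equivalent of ctrlr.CanSupplyHealthStats(), inferred from the state
// check it replaces: only NORMAL and EVICTED devices can report health stats.
func canSupplyHealthStats(c *ctlpb.NvmeController) bool {
	return c.DevState == ctlpb.NvmeDevState_NORMAL ||
		c.DevState == ctlpb.NvmeDevState_EVICTED
}
```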
if pbReq.Meta { - nsd := new(ctlpb.SmdDevice) *nsd = *sd nsd.Ctrlr = nil + nsd.Rank = engineRank.Uint32() nsd.MetaSize = pbReq.MetaSize nsd.RdbSize = pbReq.RdbSize if healthUpdated { @@ -241,79 +263,127 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc nsd.MetaWalSize = c.HealthStats.MetaWalSize nsd.RdbWalSize = c.HealthStats.RdbWalSize } - engineRank, err := engine.GetRank() - if err != nil { - return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) - } - nsd.Rank = engineRank.Uint32() - c.SmdDevices = append(c.SmdDevices, nsd) } + + c.SmdDevices = append(c.SmdDevices, nsd) + } + + var keys []string + for k := range seenCtrlrs { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + c := seenCtrlrs[k] + engine.Tracef("bdev discovered: %+v", c) + pbResp.Ctrlrs = append(pbResp.Ctrlrs, c) } return &pbResp, nil } -func bdevScanEngineAssigned(ctx context.Context, engine Engine, pbReq *ctlpb.ScanNvmeReq, devList *storage.BdevDeviceList, isStarted *bool) (*ctlpb.ScanNvmeResp, error) { +func bdevScanEngineAssigned(ctx context.Context, engine Engine, req *ctlpb.ScanNvmeReq, bdevCfgs storage.TierConfigs, isStarted *bool) (*ctlpb.ScanNvmeResp, error) { *isStarted = engine.IsStarted() if !*isStarted { - engine.Debugf("scanning engine-%d bdev tiers while engine is down", engine.Index()) - - // Retrieve engine cfg bdevs to restrict scan scope. - req := storage.BdevScanRequest{DeviceList: devList} + engine.Debugf("scanning engine-%d bdevs while engine is down", engine.Index()) + if req.Meta { + return nil, errors.New("meta smd usage info unavailable as engine stopped") + } - return bdevScanToProtoResp(engine.GetStorage().ScanBdevs, req) + return bdevScanToProtoResp(engine.GetStorage().ScanBdevs, bdevCfgs) } - engine.Debugf("scanning engine-%d bdev tiers while engine is up", engine.Index()) + engine.Debugf("scanning engine-%d bdevs while engine is up", engine.Index()) - // If engine is started but not ready, wait for ready state. If partial number of engines - // return results, indicate errors for non-ready engines whilst returning successful scan - // results. + // If engine is started but not ready, wait for ready state. pollFn := func(e Engine) bool { return e.IsReady() } if err := pollInstanceState(ctx, []Engine{engine}, pollFn); err != nil { return nil, errors.Wrapf(err, "waiting for engine %d to be ready to receive drpcs", engine.Index()) } - return scanEngineBdevsOverDrpc(ctx, engine, pbReq) + return scanEngineBdevsOverDrpc(ctx, engine, req) +} + +func getEffCtrlrCount(ctrlrs []*ctlpb.NvmeController) (int, error) { + pas := hardware.MustNewPCIAddressSet() + for _, c := range ctrlrs { + if err := pas.AddStrings(c.PciAddr); err != nil { + return 0, err + } + } + if pas.HasVMD() { + if npas, err := pas.BackingToVMDAddresses(); err != nil { + return 0, err + } else { + pas = npas + } + } + + return pas.Len(), nil } // bdevScanEngine calls either in to the private engine storage provider to scan bdevs if engine process // is not started, otherwise dRPC is used to retrieve details from the online engine. 
-func bdevScanEngine(ctx context.Context, engine Engine, req *ctlpb.ScanNvmeReq) (resp *ctlpb.ScanNvmeResp, err error) { +func bdevScanEngine(ctx context.Context, engine Engine, req *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { if req == nil { return nil, errors.New("nil request") } - eCfgBdevs := storage.TierConfigs(engine.GetStorage().GetBdevConfigs()).Bdevs() - if eCfgBdevs.Len() == 0 { + bdevCfgs := storage.TierConfigs(engine.GetStorage().GetBdevConfigs()) + nrCfgBdevs := bdevCfgs.Bdevs().Len() + + if nrCfgBdevs == 0 { return nil, errEngineBdevScanEmptyDevList } var isStarted bool - resp, err = bdevScanEngineAssigned(ctx, engine, req, eCfgBdevs, &isStarted) + resp, err := bdevScanEngineAssigned(ctx, engine, req, bdevCfgs, &isStarted) + if err != nil { + return nil, err + } + + // Compare number of VMD domain addresses rather than the number of backing devices found + // behind it as the domain is what is specified in the server config file. + nrBdevs, err := getEffCtrlrCount(resp.Ctrlrs) if err != nil { return nil, err } // Retry once if engine provider scan returns unexpected number of controllers in case // engines claimed devices between when started state was checked and scan was executed. - if !isStarted && len(resp.Ctrlrs) != eCfgBdevs.Len() { + if nrBdevs != nrCfgBdevs && !isStarted { engine.Debugf("retrying engine bdev scan as unexpected nr returned, want %d got %d", - eCfgBdevs.Len(), len(resp.Ctrlrs)) + nrCfgBdevs, nrBdevs) + + resp, err = bdevScanEngineAssigned(ctx, engine, req, bdevCfgs, &isStarted) + if err != nil { + return nil, err + } - resp, err = bdevScanEngineAssigned(ctx, engine, req, eCfgBdevs, &isStarted) + nrBdevs, err = getEffCtrlrCount(resp.Ctrlrs) if err != nil { return nil, err } } - if len(resp.Ctrlrs) != eCfgBdevs.Len() { + if nrBdevs != nrCfgBdevs { engine.Debugf("engine bdev scan returned unexpected nr, want %d got %d", - eCfgBdevs.Len(), len(resp.Ctrlrs)) + nrCfgBdevs, nrBdevs) } - return + // Filter devices in an unusable state from the response. + outCtrlrs := make([]*ctlpb.NvmeController, 0, len(resp.Ctrlrs)) + for _, c := range resp.Ctrlrs { + if c.IsScannable() { + outCtrlrs = append(outCtrlrs, c) + } else { + engine.Tracef("excluding bdev from scan results: %+v", c) + } + } + resp.Ctrlrs = outCtrlrs + + return resp, nil } func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq) (*ctlpb.SmdQueryResp_RankResp, error) { diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index dfa0a28bd3bf..bd63a56d1d68 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2023 Intel Corporation. +// (C) Copyright 2023-2024 Intel Corporation. 
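Reviewer sketch: getEffCtrlrCount (defined above) makes bdevScanEngine compare controller counts at VMD-domain granularity, since the config file lists the domain address rather than the backing devices behind it. An illustrative check using the address shapes from the vmd-enabled test case later in this file, where backing addresses encode the domain 0000:05:05.5 in their first segment; this example is an assumption about that mapping, not patch code.

```go
// Illustrative only: two backing devices behind one VMD domain should count
// as a single effective controller, matching the one configured domain.
func ExampleGetEffCtrlrCount() {
	ctrlrs := []*ctlpb.NvmeController{
		{PciAddr: "050505:01:00.0"}, // backing devices of VMD domain
		{PciAddr: "050505:03:00.0"}, // 0000:05:05.5
	}
	n, err := getEffCtrlrCount(ctrlrs)
	fmt.Println(n, err)
	// Output: 1 <nil>
}
```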
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -16,6 +16,7 @@ import ( "github.com/daos-stack/daos/src/control/common/proto" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" "github.com/daos-stack/daos/src/control/server/engine" @@ -26,12 +27,20 @@ import ( func TestIOEngineInstance_bdevScanEngine(t *testing.T) { c := storage.MockNvmeController(2) + withState := func(ctrlr *ctlpb.NvmeController, state ctlpb.NvmeDevState) *ctlpb.NvmeController { + ctrlr.DevState = state + ctrlr.HealthStats = nil + // scanEngineBdevsOverDrpc will always populate RoleBits in ctrlr.SmdDevices + ctrlr.SmdDevices = []*ctlpb.SmdDevice{{RoleBits: 7}} + return ctrlr + } + withDevState := func(smd *ctlpb.SmdDevice, state ctlpb.NvmeDevState) *ctlpb.SmdDevice { + smd.Ctrlr.DevState = state + return smd + } defSmdScanRes := func() *ctlpb.SmdDevResp { - return &ctlpb.SmdDevResp{ - Devices: []*ctlpb.SmdDevice{ - proto.MockSmdDevice(c, 2), - }, - } + sd := proto.MockSmdDevice(c, 2) + return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}} } healthRespWithUsage := func() *ctlpb.BioHealthResp { mh := proto.MockNvmeHealth(2) @@ -43,6 +52,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { for name, tc := range map[string]struct { req ctlpb.ScanNvmeReq bdevAddrs []string + rank int provRes *storage.BdevScanResponse provErr error engStopped bool @@ -69,8 +79,20 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { }, expResp: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ - proto.MockNvmeController(1), - proto.MockNvmeController(2), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(1) + c.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return c + }(), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return c + }(), }, State: new(ctlpb.ResponseState), }, @@ -86,7 +108,13 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { engStopped: true, expResp: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ - proto.MockNvmeController(1), + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(1) + c.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return c + }(), }, State: new(ctlpb.ResponseState), }, @@ -106,27 +134,81 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { provErr: errors.New("provider scan fail"), expErr: errors.New("provider scan fail"), }, - "scan over drpc; no health or meta": { - smdRes: defSmdScanRes(), - healthRes: proto.MockNvmeHealth(2), + "engines stopped; scan over engine provider; vmd enabled": { + bdevAddrs: []string{"0000:05:05.5"}, + engStopped: true, + provRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + &storage.NvmeController{ + PciAddr: "050505:01:00.0", + NvmeState: storage.NvmeStateNormal, + }, + &storage.NvmeController{ + PciAddr: "050505:03:00.0", + NvmeState: storage.NvmeStateNormal, + }, + }, + }, expResp: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = nil - c.SmdDevices = nil - return c + nc := &ctlpb.NvmeController{ + PciAddr: "050505:01:00.0", + DevState: ctlpb.NvmeDevState_NORMAL, + 
} + nc.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return nc + }(), + func() *ctlpb.NvmeController { + nc := &ctlpb.NvmeController{ + PciAddr: "050505:03:00.0", + DevState: ctlpb.NvmeDevState_NORMAL, + } + nc.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: uint32(ranklist.NilRank)}, + } + return nc }(), }, State: new(ctlpb.ResponseState), }, + expBackendScanCalls: []storage.BdevScanRequest{ + {DeviceList: storage.MustNewBdevDeviceList("0000:05:05.5")}, + }, }, "scan fails over drpc": { smdErr: errors.New("drpc fail"), expErr: errors.New("drpc fail"), }, + "scan over drpc; no req flags; rank and roles populated": { + req: ctlpb.ScanNvmeReq{}, + rank: 1, + smdRes: defSmdScanRes(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = nil + c.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: 1, RoleBits: storage.BdevRoleAll}, + } + return c + }(), + }, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; no req flags; invalid rank": { + req: ctlpb.ScanNvmeReq{}, + rank: -1, + smdRes: defSmdScanRes(), + expErr: errors.New("nil superblock"), + }, "scan over drpc; with health": { req: ctlpb.ScanNvmeReq{Health: true}, + rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ @@ -134,15 +216,18 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { func() *ctlpb.NvmeController { c := proto.MockNvmeController(2) c.HealthStats = healthRespWithUsage() - c.SmdDevices = nil + c.SmdDevices = []*ctlpb.SmdDevice{ + {Rank: 1, RoleBits: storage.BdevRoleAll}, + } return c }(), }, State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd": { + "scan over drpc; with meta": { req: ctlpb.ScanNvmeReq{Meta: true}, + rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ @@ -150,9 +235,9 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { func() *ctlpb.NvmeController { c := proto.MockNvmeController(2) c.HealthStats = nil - c.SmdDevices = []*ctlpb.SmdDevice{ - proto.MockSmdDevice(nil, 2), - } + sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 + c.SmdDevices = []*ctlpb.SmdDevice{sd} return c }(), }, @@ -161,6 +246,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { }, "scan over drpc; with smd and health; usage and wal size reported": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ @@ -169,6 +255,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { c := proto.MockNvmeController(2) c.HealthStats = healthRespWithUsage() sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 sd.TotalBytes = c.HealthStats.TotalBytes sd.AvailBytes = c.HealthStats.AvailBytes sd.ClusterSize = c.HealthStats.ClusterSize @@ -181,6 +268,45 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, + "scan over drpc; only ctrlrs with valid states shown": { + req: ctlpb.ScanNvmeReq{}, + bdevAddrs: []string{ + test.MockPCIAddr(1), test.MockPCIAddr(2), + test.MockPCIAddr(1), test.MockPCIAddr(2), + test.MockPCIAddr(5), + }, + smdRes: &ctlpb.SmdDevResp{ + Devices: proto.SmdDevices{ + withDevState(proto.MockSmdDevice( + storage.MockNvmeController(1), 1), + ctlpb.NvmeDevState_UNPLUGGED), + withDevState(proto.MockSmdDevice( + storage.MockNvmeController(2), 2), + ctlpb.NvmeDevState_UNKNOWN), + withDevState(proto.MockSmdDevice( + storage.MockNvmeController(3), 3), + 
ctlpb.NvmeDevState_NORMAL), + withDevState(proto.MockSmdDevice( + storage.MockNvmeController(4), 4), + ctlpb.NvmeDevState_NEW), + withDevState(proto.MockSmdDevice( + storage.MockNvmeController(5), 5), + ctlpb.NvmeDevState_EVICTED), + }, + }, + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + withState(proto.MockNvmeController(3), + ctlpb.NvmeDevState_NORMAL), + withState(proto.MockNvmeController(4), + ctlpb.NvmeDevState_NEW), + withState(proto.MockNvmeController(5), + ctlpb.NvmeDevState_EVICTED), + }, + State: new(ctlpb.ResponseState), + }, + }, "scan over drpc; with smd and health; missing ctrlr in smd": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: func() *ctlpb.SmdDevResp { @@ -250,6 +376,13 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { cs := newMockControlServiceFromBackends(t, log, sCfg, bmb, smb, nil, tc.engStopped) ei := cs.harness.Instances()[0].(*EngineInstance) + if tc.rank < 0 { + ei.setSuperblock(nil) + } else { + ei.setSuperblock(&Superblock{ + Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true, + }) + } resp, err := bdevScanEngine(test.Context(t), ei, &tc.req) test.CmpErr(t, tc.expErr, err) diff --git a/src/control/server/instance_superblock.go b/src/control/server/instance_superblock.go index 0d6ec613a8b3..c7eff0e100f5 100644 --- a/src/control/server/instance_superblock.go +++ b/src/control/server/instance_superblock.go @@ -85,11 +85,11 @@ func (ei *EngineInstance) hasSuperblock() bool { return ei.getSuperblock() != nil } -// NeedsSuperblock indicates whether or not the instance appears +// needsSuperblock indicates whether or not the instance appears // to need a superblock to be created in order to start. // // Should not be called if SCM format is required. 
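Reviewer sketch: the "only ctrlrs with valid states shown" case above pins down the filtering semantics of IsScannable, whose implementation is not in this patch. From the expected results, it presumably behaves like the following.

```go
// Inferred from the test expectations above: NORMAL, NEW and EVICTED
// controllers survive filtering; UNPLUGGED and UNKNOWN are dropped.
func isScannable(state ctlpb.NvmeDevState) bool {
	switch state {
	case ctlpb.NvmeDevState_NORMAL,
		ctlpb.NvmeDevState_NEW,
		ctlpb.NvmeDevState_EVICTED:
		return true
	default:
		return false
	}
}
```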
-func (ei *EngineInstance) NeedsSuperblock() (bool, error) { +func (ei *EngineInstance) needsSuperblock() (bool, error) { if ei.hasSuperblock() { ei.log.Debugf("instance %d has superblock set", ei.Index()) return false, nil } @@ -116,7 +116,7 @@ func (ei *EngineInstance) createSuperblock() error { return errors.Errorf("can't create superblock: instance %d already started", ei.Index()) } - needsSuperblock, err := ei.NeedsSuperblock() // scm format completed by now + needsSuperblock, err := ei.needsSuperblock() // scm format completed by now if !needsSuperblock { return nil } diff --git a/src/control/server/instance_test.go b/src/control/server/instance_test.go index ea0882854677..2792c95faccb 100644 --- a/src/control/server/instance_test.go +++ b/src/control/server/instance_test.go @@ -284,3 +284,7 @@ func (mi *MockInstance) GetStorage() *storage.Provider { func (mi *MockInstance) Debugf(format string, args ...interface{}) { return } + +func (mi *MockInstance) Tracef(format string, args ...interface{}) { + return +} diff --git a/src/control/server/mgmt_cont_test.go b/src/control/server/mgmt_cont_test.go index 55efb4176494..f4096d7fd14c 100644 --- a/src/control/server/mgmt_cont_test.go +++ b/src/control/server/mgmt_cont_test.go @@ -24,7 +24,8 @@ import ( ) const ( - mockUUID = "11111111-1111-1111-1111-111111111111" + mockUUID = "11111111-1111-1111-1111-111111111111" + badMockUUID = "00000000-1111-1111-1111-111111111111" ) func makeBadBytes(count int) (badBytes []byte) { diff --git a/src/control/server/mgmt_pool.go b/src/control/server/mgmt_pool.go index ae1730fbf025..0afb80c5d913 100644 --- a/src/control/server/mgmt_pool.go +++ b/src/control/server/mgmt_pool.go @@ -642,7 +642,7 @@ func (svc *mgmtSvc) poolEvictConnections(ctx context.Context, req *mgmtpb.PoolDe evResp, err := svc.PoolEvict(ctx, evReq) if err != nil { - svc.log.Debugf("svc.PoolEvict failed\n") + svc.log.Errorf("svc.PoolEvict failed\n") return 0, err } @@ -699,7 +699,7 @@ func (svc *mgmtSvc) PoolDestroy(parent context.Context, req *mgmtpb.PoolDestroyR // Perform separate PoolEvict _before_ possible transition to destroying state. evStatus, err := svc.poolEvictConnections(ctx, req) - if err != nil { + if !req.Force && err != nil { return nil, err } @@ -715,8 +715,10 @@ func (svc *mgmtSvc) PoolDestroy(parent context.Context, req *mgmtpb.PoolDestroyR if evStatus != daos.Success { svc.log.Errorf("PoolEvict during pool destroy failed: %s", evStatus) - resp.Status = int32(evStatus) - return resp, nil + if !req.Force { + resp.Status = int32(evStatus) + return resp, nil + } } } diff --git a/src/control/server/mgmt_pool_test.go b/src/control/server/mgmt_pool_test.go index 10b34243bc95..db954672f81a 100644 --- a/src/control/server/mgmt_pool_test.go +++ b/src/control/server/mgmt_pool_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation.
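Reviewer sketch: the mgmt_pool.go hunks above change PoolDestroy so that, when Force is set, evict errors and non-success evict statuses are logged but no longer abort the destroy. A hypothetical client-side fragment showing the combination the new tests exercise; svc, ctx and poolUUID are assumed to be in scope.

```go
// Hypothetical caller fragment, not from the patch.
req := &mgmtpb.PoolDestroyReq{
	Sys:       build.DefaultSystemName,
	Id:        poolUUID,
	Recursive: true, // remove remaining containers along with the pool
	Force:     true, // log evict failures and continue to the destroy dRPC
}
resp, err := svc.PoolDestroy(ctx, req)
if err != nil {
	return err // with Force set, evict errors no longer surface here
}
_ = resp
```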
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -719,37 +719,43 @@ func TestServer_MgmtSvc_PoolCreateDownRanks(t *testing.T) { func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) - missingSB := newTestMgmtSvc(t, log) - missingSB.harness.instances[0].(*EngineInstance)._superblock = nil - notAP := newTestMgmtSvc(t, log) - testPoolService := &system.PoolService{ - PoolLabel: "test-pool", - PoolUUID: uuid.MustParse(mockUUID), - Replicas: []ranklist.Rank{0, 1, 2}, - State: system.PoolServiceStateReady, - Storage: &system.PoolServiceStorage{ - CreationRankStr: ranklist.MustCreateRankSet("0-7").String(), - }, + creating := system.PoolServiceStateCreating + ready := system.PoolServiceStateReady + destroying := system.PoolServiceStateDestroying + testPoolService := func() *system.PoolService { + return &system.PoolService{ + PoolLabel: "test-pool", + PoolUUID: uuid.MustParse(mockUUID), + Replicas: []ranklist.Rank{0, 1, 2}, + State: ready, + Storage: &system.PoolServiceStorage{ + CreationRankStr: ranklist.MustCreateRankSet("0-7").String(), + }, + } } - svcWithState := func(in *system.PoolService, state system.PoolServiceState) (out *system.PoolService) { - out = new(system.PoolService) + curTestPoolSvc := new(system.PoolService) + svcWithState := func(in *system.PoolService, state system.PoolServiceState) *system.PoolService { + out := new(system.PoolService) *out = *in out.State = state - return + return out } - // Note: PoolDestroy will invoke one or two dRPCs (evict, evict+destroy) - // expDrpcEvReq is here for those cases in which just the evict dRPC is run + // Note: PoolDestroy will invoke up to three dRPCs (list-cont/evict/destroy). + // expDrpcListContReq/expDrpcEvReq/expDrpcReq verify the request used in the last of the + // calls. drpcResps specifies the list of responses sequentially returned over the dRPC + // channel. for name, tc := range map[string]struct { mgmtSvc *mgmtSvc - setupMockDrpc func(_ *mgmtSvc, _ error) - poolSvc *system.PoolService + poolSvcState *system.PoolServiceState // Initial state. req *mgmtpb.PoolDestroyReq + junkResp bool + drpcResps []*mockDrpcResponse // Sequential list of dRPC responses. expDrpcListContReq *mgmtpb.ListContReq expDrpcEvReq *mgmtpb.PoolEvictReq expDrpcReq *mgmtpb.PoolDestroyReq expResp *mgmtpb.PoolDestroyResp - expSvc *system.PoolService + expSvcState *system.PoolServiceState // Expected end state. 
expErr error }{ "nil request": { @@ -759,37 +765,96 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Sys: "bad"}, expErr: FaultWrongSystem("bad", build.DefaultSystemName), }, - "missing superblock": { - mgmtSvc: missingSB, - req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, - expErr: errors.New("not an access point"), + "missing uuid": { + req: &mgmtpb.PoolDestroyReq{}, + expErr: errors.New("empty pool id"), }, - "not access point": { - mgmtSvc: notAP, - req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, - expErr: errors.New("not an access point"), + "unknown uuid": { + req: &mgmtpb.PoolDestroyReq{Id: badMockUUID}, + expErr: errors.New("for a different pool"), }, - "dRPC send fails": { + "no dRPC response": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, - expErr: errors.New("send failure"), + expErr: errors.New("no response"), }, "garbage resp": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, + junkResp: true, + expErr: errors.New("unmarshal"), + }, + "error resp": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, - setupMockDrpc: func(svc *mgmtSvc, err error) { - // dRPC call returns junk in the message body - badBytes := makeBadBytes(42) - - setupMockDrpcClientBytes(svc, badBytes, err) + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{}, + Error: errors.New("not an access point"), + }, }, - expErr: errors.New("unmarshal"), + expDrpcListContReq: &mgmtpb.ListContReq{ + Sys: build.DefaultSystemName, + Id: mockUUID, + SvcRanks: []uint32{0, 1, 2}, + }, + expErr: errors.New("not an access point"), }, - "missing uuid": { - req: &mgmtpb.PoolDestroyReq{}, - expErr: errors.New("empty pool id"), + // Note: evict dRPC fails as no pool service alive, remains in creating state. + // getPoolService() returns TryAgain in resp before list-cont dRPC is issued. + "already creating, evict dRPC fails -DER_AGAIN, remains creating": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, + poolSvcState: &creating, + expResp: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.TryAgain), // Returned from list-cont call. + }, + expSvcState: &creating, + }, + // getPoolService() returns error before evict dRPC is issued. + "recursive=true, already creating, evict dRPC fails -DER_AGAIN, remains creating": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + poolSvcState: &creating, + expErr: daos.TryAgain, + expSvcState: &creating, + }, + // getPoolService() returns TryAgain during list-cont and evict but errors ignored. + "force=true, already creating, evict dRPC fails -DER_AGAIN, remains creating": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Force: true}, + poolSvcState: &creating, + expResp: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.TryAgain), // Returned from list-cont call. + }, + expSvcState: &creating, }, - // Note: evict dRPC fails, still expect a PoolDestroyResp from PoolDestroy - "evict dRPC fails with -DER_BUSY due to open handles force=false, pool still ready": { + // getPoolService() returns TryAgain during list-cont and evict but errors ignored. 
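Reviewer sketch: the rewritten cases in this table drive the mock dRPC channel with an ordered drpcResps list instead of per-test setup closures. A condensed sketch of how the rewritten runner later in this hunk queues them, mirroring the three-step list-cont/evict/destroy flow of the "successful destroy" case; t and ei are assumed in scope.

```go
// Sketch of the sequential mock responses for a full destroy: one response
// per dRPC, consumed in call order.
cfg := new(mockDrpcClientConfig)
for _, m := range []*mockDrpcResponse{
	{Message: &mgmtpb.ListContResp{}},    // 1: list containers
	{Message: &mgmtpb.PoolEvictResp{}},   // 2: evict handles
	{Message: &mgmtpb.PoolDestroyResp{}}, // 3: destroy
} {
	cfg.setSendMsgResponseList(t, m)
}
ei.setDrpcClient(newMockDrpcClient(cfg))
```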
+ "force=true, recursive=true, already creating, evict dRPC fails -DER_AGAIN, destroy succeeds": { + req: &mgmtpb.PoolDestroyReq{ + Id: mockUUID, + Force: true, + Recursive: true, + }, + poolSvcState: &creating, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, + }, + expDrpcReq: &mgmtpb.PoolDestroyReq{ + Sys: build.DefaultSystemName, + Id: mockUUID, + SvcRanks: []uint32{0, 1, 2, 3, 4, 5, 6, 7}, + Recursive: true, + Force: true, + }, + expResp: &mgmtpb.PoolDestroyResp{}, + }, + // Note: evict dRPC fails but because of Busy status, remains in ready state. + "recursive=true, evict dRPC fails -DER_BUSY due to open handles, pool still ready": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + Status: int32(daos.Busy), + }, + }, + }, expDrpcEvReq: &mgmtpb.PoolEvictReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -797,18 +862,21 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { Destroy: true, ForceDestroy: false, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolEvictResp{ - Status: int32(daos.Busy), - }, nil) - }, expResp: &mgmtpb.PoolDestroyResp{ - Status: int32(daos.Busy), + Status: int32(daos.Busy), // Returned from evict call. }, - expSvc: testPoolService, + expSvcState: &ready, }, - "evict dRPC fails due to engine error": { + // Note: evict dRPC fails but because of NotService status, remains in ready state. + "recursive=true, evict dRPC fails -DER_NO_SERVICE due to open handles, pool still ready": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + Status: int32(daos.NoService), + }, + }, + }, expDrpcEvReq: &mgmtpb.PoolEvictReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -816,76 +884,115 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { Destroy: true, ForceDestroy: false, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolEvictResp{ - Status: int32(daos.MiscError), - }, nil) - }, expResp: &mgmtpb.PoolDestroyResp{ - Status: int32(daos.MiscError), + Status: int32(daos.NoService), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &ready, }, - "force=true, evict dRPC fails due to engine error": { - req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Force: true, Recursive: true}, + "recursive=true, evict dRPC with engine error": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + Status: int32(daos.MiscError), + }, + }, + }, expDrpcEvReq: &mgmtpb.PoolEvictReq{ Sys: build.DefaultSystemName, Id: mockUUID, SvcRanks: []uint32{0, 1, 2}, Destroy: true, - ForceDestroy: true, - }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolEvictResp{ - Status: int32(daos.MiscError), - }, nil) + ForceDestroy: false, }, expResp: &mgmtpb.PoolDestroyResp{ Status: int32(daos.MiscError), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &destroying, }, - "already destroying, destroy dRPC fails due to engine error": { - req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + "force=true, recursive=true, evict dRPC fails -DER_BUSY, pool in destroying state": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Force: true, Recursive: true}, + drpcResps: 
[]*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + Status: int32(daos.Busy), + }, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.MiscError), + }, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, SvcRanks: []uint32{0, 1, 2, 3, 4, 5, 6, 7}, Recursive: true, + Force: true, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolDestroyResp{ - Status: int32(daos.MiscError), - }, nil) - }, - poolSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), expResp: &mgmtpb.PoolDestroyResp{ Status: int32(daos.MiscError), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &destroying, }, - "force=true already destroying, destroy dRPC fails due to engine error": { - req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Force: true, Recursive: true}, + // Initial destroying state means list-containers and evict calls are skipped. + "recursive=true, already destroying, destroy dRPC with engine error": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + poolSvcState: &destroying, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.MiscError), + }, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, SvcRanks: []uint32{0, 1, 2, 3, 4, 5, 6, 7}, Recursive: true, + }, + expResp: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.MiscError), + }, + expSvcState: &destroying, + }, + "force=true, recursive=true, already destroying, destroy dRPC with engine error": { + req: &mgmtpb.PoolDestroyReq{ + Id: mockUUID, Force: true, + Recursive: true, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolDestroyResp{ - Status: int32(daos.MiscError), - }, nil) + poolSvcState: &destroying, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{ + Status: int32(daos.MiscError), + }, + }, + }, + expDrpcReq: &mgmtpb.PoolDestroyReq{ + Sys: build.DefaultSystemName, + Id: mockUUID, + SvcRanks: []uint32{0, 1, 2, 3, 4, 5, 6, 7}, + Recursive: true, + Force: true, }, - poolSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), expResp: &mgmtpb.PoolDestroyResp{ Status: int32(daos.MiscError), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &destroying, }, - "evict dRPC fails with -DER_NOTLEADER on first try": { + "evict dRPC fails -DER_NOTLEADER on first try": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + Status: int32(daos.NotLeader), + }, + }, + }, expDrpcEvReq: &mgmtpb.PoolEvictReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -893,18 +1000,20 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { Destroy: true, ForceDestroy: false, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolEvictResp{ - Status: int32(daos.NotLeader), - }, nil) - }, expResp: &mgmtpb.PoolDestroyResp{ Status: int32(daos.NotLeader), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &destroying, }, - "evict dRPC fails with -DER_NOTREPLICA on first try": { + "evict dRPC fails -DER_NOTREPLICA on first try": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{ + 
Status: int32(daos.NotReplica), + }, + }, + }, expDrpcEvReq: &mgmtpb.PoolEvictReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -912,18 +1021,19 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { Destroy: true, ForceDestroy: false, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.PoolEvictResp{ - Status: int32(daos.NotReplica), - }, nil) - }, expResp: &mgmtpb.PoolDestroyResp{ Status: int32(daos.NotReplica), }, - expSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), + expSvcState: &destroying, }, "already destroying, destroy dRPC succeeds": { - req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + poolSvcState: &destroying, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -931,11 +1041,20 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { Recursive: true, }, expResp: &mgmtpb.PoolDestroyResp{}, - poolSvc: svcWithState(testPoolService, system.PoolServiceStateDestroying), }, - // Note: PoolDestroy() is going to run both evict and destroy dRPCs each of which will succeed "successful destroy": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -946,6 +1065,23 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { }, "force=true, successful destroy": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Force: true, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{ + Containers: []*mgmtpb.ListContResp_Cont{ + { + Uuid: mockUUID, + }, + }, + }, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -957,42 +1093,78 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { }, "recursive=false, list containers fails": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Error: daos.MiscError, + }, + }, expDrpcListContReq: &mgmtpb.ListContReq{ Sys: build.DefaultSystemName, Id: mockUUID, SvcRanks: []uint32{0, 1, 2}, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.ListContResp{ - Status: int32(daos.MiscError), - }, nil) - }, - expResp: &mgmtpb.PoolDestroyResp{ - Status: int32(daos.MiscError), - }, - expSvc: testPoolService, + expErr: daos.MiscError, }, "recursive=false, containers exist; destroy refused": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{ + Containers: []*mgmtpb.ListContResp_Cont{ + { + Uuid: mockUUID, + }, + }, + }, + }, + }, expDrpcListContReq: &mgmtpb.ListContReq{ Sys: build.DefaultSystemName, Id: mockUUID, SvcRanks: []uint32{0, 1, 2}, }, - setupMockDrpc: func(svc *mgmtSvc, err error) { - setupMockDrpcClient(svc, &mgmtpb.ListContResp{ - Containers: []*mgmtpb.ListContResp_Cont{ - {Uuid: "56781234-5678-5678-5678-123456789abc"}, - {Uuid: "67812345-6781-6781-6781-123456789abc"}, - {Uuid: "78123456-7812-7812-7812-123456789abc"}, - 
{Uuid: "81234567-8123-8123-8123-123456789abc"}, + expErr: FaultPoolHasContainers, + }, + "recursive=true; list containers returns true; successful destroy": { + req: &mgmtpb.PoolDestroyReq{Id: mockUUID, Recursive: true}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{ + Containers: []*mgmtpb.ListContResp_Cont{ + { + Uuid: mockUUID, + }, + }, }, - }, nil) + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, }, - expErr: FaultPoolHasContainers, + expDrpcReq: &mgmtpb.PoolDestroyReq{ + Sys: build.DefaultSystemName, + Id: mockUUID, + SvcRanks: []uint32{0, 1, 2, 3, 4, 5, 6, 7}, + Recursive: true, + }, + expResp: &mgmtpb.PoolDestroyResp{}, }, "recursive=false, containers do not exist; successful destroy": { req: &mgmtpb.PoolDestroyReq{Id: mockUUID}, + drpcResps: []*mockDrpcResponse{ + &mockDrpcResponse{ + Message: &mgmtpb.ListContResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolEvictResp{}, + }, + &mockDrpcResponse{ + Message: &mgmtpb.PoolDestroyResp{}, + }, + }, expDrpcReq: &mgmtpb.PoolDestroyReq{ Sys: build.DefaultSystemName, Id: mockUUID, @@ -1007,70 +1179,55 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { buf.Reset() defer test.ShowBufferOnFailure(t, buf) - if tc.mgmtSvc == nil { - tc.mgmtSvc = newTestMgmtSvc(t, log) - } + mgmtSvc := newTestMgmtSvc(t, log) numMembers := 8 for i := 0; i < numMembers; i++ { - if _, err := tc.mgmtSvc.membership.Add(system.MockMember(t, uint32(i), system.MemberStateJoined)); err != nil { + if _, err := mgmtSvc.membership.Add(system.MockMember(t, + uint32(i), system.MemberStateJoined)); err != nil { t.Fatal(err) } } - poolSvc := tc.poolSvc - if poolSvc == nil { - poolSvc = testPoolService + curTestPoolSvc = testPoolService() + if tc.poolSvcState != nil { + curTestPoolSvc.State = *tc.poolSvcState } - lock, ctx := getPoolLockCtx(t, nil, tc.mgmtSvc.sysdb, poolSvc.PoolUUID) + + lock, ctx := getPoolLockCtx(t, nil, mgmtSvc.sysdb, curTestPoolSvc.PoolUUID) defer lock.Release() - if err := tc.mgmtSvc.sysdb.AddPoolService(ctx, poolSvc); err != nil { + if err := mgmtSvc.sysdb.AddPoolService(ctx, curTestPoolSvc); err != nil { t.Fatal(err) } - if tc.setupMockDrpc == nil { - tc.setupMockDrpc = func(svc *mgmtSvc, err error) { - setupMockDrpcClient(tc.mgmtSvc, tc.expResp, tc.expErr) + cfg := new(mockDrpcClientConfig) + if tc.junkResp { + cfg.setSendMsgResponse(drpc.Status_SUCCESS, makeBadBytes(42), nil) + } else { + for _, mock := range tc.drpcResps { + cfg.setSendMsgResponseList(t, mock) } } - tc.setupMockDrpc(tc.mgmtSvc, tc.expErr) + ei := mgmtSvc.harness.instances[0].(*EngineInstance) + ei.setDrpcClient(newMockDrpcClient(cfg)) if tc.req != nil && tc.req.Sys == "" { tc.req.Sys = build.DefaultSystemName } - gotResp, gotErr := tc.mgmtSvc.PoolDestroy(ctx, tc.req) + gotResp, gotErr := mgmtSvc.PoolDestroy(ctx, tc.req) test.CmpErr(t, tc.expErr, gotErr) - if tc.expErr != nil { - return - } cmpOpts := append( test.DefaultCmpOpts(), cmpopts.IgnoreTypes(system.PoolServiceStorage{}), cmpopts.IgnoreFields(system.PoolService{}, "LastUpdate"), ) - if diff := cmp.Diff(tc.expResp, gotResp, cmpOpts...); diff != "" { - t.Fatalf("unexpected response (-want, +got)\n%s\n", diff) - } - - gotSvc, err := tc.mgmtSvc.sysdb.FindPoolServiceByUUID(uuid.MustParse(mockUUID)) - if err != nil { - if tc.expSvc != nil || !system.IsPoolNotFound(err) { - t.Fatalf("unexpected error: %v", err) - } - } - if tc.expSvc == nil && gotSvc != nil { - t.Fatalf("expected pool to be destroyed, 
but found %+v", gotSvc) - } - if diff := cmp.Diff(tc.expSvc, gotSvc, cmpOpts...); diff != "" { - t.Fatalf("unexpected ending PS values (-want, +got)\n%s\n", diff) - } if tc.expDrpcReq != nil { gotReq := new(mgmtpb.PoolDestroyReq) - if err := proto.Unmarshal(getLastMockCall(tc.mgmtSvc).Body, gotReq); err != nil { + if err := proto.Unmarshal(getLastMockCall(mgmtSvc).Body, gotReq); err != nil { t.Fatal(err) } if diff := cmp.Diff(tc.expDrpcReq, gotReq, cmpOpts...); diff != "" { @@ -1079,7 +1236,7 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { } if tc.expDrpcEvReq != nil { gotReq := new(mgmtpb.PoolEvictReq) - if err := proto.Unmarshal(getLastMockCall(tc.mgmtSvc).Body, gotReq); err != nil { + if err := proto.Unmarshal(getLastMockCall(mgmtSvc).Body, gotReq); err != nil { t.Fatal(err) } if diff := cmp.Diff(tc.expDrpcEvReq, gotReq, cmpOpts...); diff != "" { @@ -1088,13 +1245,36 @@ func TestServer_MgmtSvc_PoolDestroy(t *testing.T) { } if tc.expDrpcListContReq != nil { gotReq := new(mgmtpb.ListContReq) - if err := proto.Unmarshal(getLastMockCall(tc.mgmtSvc).Body, gotReq); err != nil { + if err := proto.Unmarshal(getLastMockCall(mgmtSvc).Body, gotReq); err != nil { t.Fatal(err) } if diff := cmp.Diff(tc.expDrpcListContReq, gotReq, cmpOpts...); diff != "" { t.Fatalf("unexpected list cont dRPC call (-want, +got):\n%s\n", diff) } } + + if tc.expErr != nil { + return + } + + if diff := cmp.Diff(tc.expResp, gotResp, cmpOpts...); diff != "" { + t.Fatalf("unexpected response (-want, +got)\n%s\n", diff) + } + + gotSvc, err := mgmtSvc.sysdb.FindPoolServiceByUUID(uuid.MustParse(mockUUID)) + if err != nil { + if tc.expSvcState != nil || !system.IsPoolNotFound(err) { + t.Fatalf("unexpected error: %v", err) + } + } + if tc.expSvcState != nil { + expSvc := svcWithState(curTestPoolSvc, *tc.expSvcState) + if diff := cmp.Diff(expSvc, gotSvc, cmpOpts...); diff != "" { + t.Fatalf("unexpected ending PS values (-want, +got)\n%s\n", diff) + } + } else if gotSvc != nil { + t.Fatalf("expected pool to be destroyed, but found %+v", gotSvc) + } }) } } diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go index 596bb0c50a99..792ffa545414 100644 --- a/src/control/server/server_utils.go +++ b/src/control/server/server_utils.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -523,9 +523,11 @@ func registerEngineEventCallbacks(srv *server, engine *EngineInstance, allStarte engine.OnStorageReady(func(_ context.Context) error { srv.log.Debugf("engine %d: storage ready", engine.Index()) - // Attempt to remove unused hugepages, log error only. - if err := cleanEngineHugepages(srv); err != nil { - srv.log.Errorf(err.Error()) + if !srv.cfg.DisableHugepages { + // Attempt to remove unused hugepages, log error only. + if err := cleanEngineHugepages(srv); err != nil { + srv.log.Errorf(err.Error()) + } } // Retrieve up-to-date meminfo to check resource availability. diff --git a/src/control/server/storage/bdev.go b/src/control/server/storage/bdev.go index d69326b4aaa9..df51110913d3 100644 --- a/src/control/server/storage/bdev.go +++ b/src/control/server/storage/bdev.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -52,6 +52,7 @@ const ( ConfSetHotplugBusidRange = C.NVME_CONF_SET_HOTPLUG_RANGE ConfSetAccelProps = C.NVME_CONF_SET_ACCEL_PROPS ConfSetSpdkRpcServer = C.NVME_CONF_SET_SPDK_RPC_SERVER + ConfSetAutoFaultyProps = C.NVME_CONF_SET_AUTO_FAULTY ) // Acceleration related constants for engine setting and optional capabilities. @@ -124,13 +125,13 @@ func (nds *NvmeDevState) UnmarshalJSON(data []byte) error { // LedState represents the LED state of device. type LedState int32 -// LedState values representing the VMD LED state (see include/spdk/vmd.h). +// LedState values representing the VMD LED state (see src/proto/ctl/smd.proto). const ( - LedStateNormal LedState = iota + LedStateUnknown LedState = iota LedStateIdentify LedStateFaulty LedStateRebuild - LedStateUnknown + LedStateNormal ) func (vls LedState) String() string { @@ -389,7 +390,11 @@ type NvmeControllers []*NvmeController func (ncs NvmeControllers) String() string { var ss []string for _, c := range ncs { - ss = append(ss, c.PciAddr) + s := c.PciAddr + for _, sd := range c.SmdDevices { + s += fmt.Sprintf("-nsid%d-%s", sd.CtrlrNamespaceID, sd.Roles.String()) + } + ss = append(ss, s) } return strings.Join(ss, ", ") } @@ -545,6 +550,7 @@ type ( Hostname string AccelProps AccelProps SpdkRpcSrvProps SpdkRpcServer + AutoFaultyProps BdevAutoFaulty VMDEnabled bool ScannedBdevs NvmeControllers // VMD needs address mapping for backing devices. } diff --git a/src/control/server/storage/bdev/backend_class_test.go b/src/control/server/storage/bdev/backend_class_test.go index 7efc2cd5a0ba..464877513181 100644 --- a/src/control/server/storage/bdev/backend_class_test.go +++ b/src/control/server/storage/bdev/backend_class_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
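Reviewer sketch: the LedState constants above are reordered so the Go values line up with the protobuf definition in src/proto/ctl/smd.proto rather than spdk/vmd.h. An illustrative sanity check of the new numbering, which follows directly from the iota order shown; this test is not part of the patch.

```go
func TestStorage_LedState_ProtoValues(t *testing.T) {
	// Values follow the new iota order: Unknown=0 through Normal=4.
	want := map[int32]storage.LedState{
		0: storage.LedStateUnknown,
		1: storage.LedStateIdentify,
		2: storage.LedStateFaulty,
		3: storage.LedStateRebuild,
		4: storage.LedStateNormal,
	}
	for val, state := range want {
		if int32(state) != val {
			t.Errorf("%s = %d, want %d", state, int32(state), val)
		}
	}
}
```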
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -107,25 +107,19 @@ func TestBackend_writeJSONFile(t *testing.T) { host, _ := os.Hostname() tests := map[string]struct { - confIn storage.TierConfig - enableVmd bool - enableHotplug bool - hotplugBusidRange string - accelEngine string - accelOptMask storage.AccelOptionBits - rpcSrvEnable bool - rpcSrvSockAddr string - expErr error - expOut string + confIn *engine.Config + enableVmd bool + expErr error + expOut string }{ "nvme; single ssds": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1)...), }, - }, + }), expOut: ` { "daos_data": { @@ -174,7 +168,7 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; multiple ssds": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ @@ -183,7 +177,7 @@ func TestBackend_writeJSONFile(t *testing.T) { OptionBits: storage.OptionBits(storage.BdevRoleAll), }, }, - }, + }), expOut: ` { "daos_data": { @@ -240,14 +234,14 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; multiple ssds; vmd enabled; bus-id range": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1, 2)...), BusidRange: storage.MustNewBdevBusRange("0x80-0x8f"), }, - }, + }), enableVmd: true, expOut: ` { @@ -314,15 +308,14 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; multiple ssds; hotplug enabled; bus-id range": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1, 2)...), BusidRange: storage.MustNewBdevBusRange("0x80-0x8f"), }, - }, - enableHotplug: true, + }).WithStorageEnableHotplug(true), expOut: ` { "daos_data": { @@ -387,15 +380,14 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; multiple ssds; vmd and hotplug enabled": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1, 2)...), }, - }, - enableHotplug: true, - enableVmd: true, + }).WithStorageEnableHotplug(true), + enableVmd: true, expOut: ` { "daos_data": { @@ -469,16 +461,15 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; single controller; acceleration set to none; move and crc opts specified": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1)...), }, - }, - // Verify default "none" acceleration setting is ignored. - accelEngine: storage.AccelEngineNone, - accelOptMask: storage.AccelOptCRCFlag | storage.AccelOptMoveFlag, + // Verify default "none" acceleration setting is ignored. 
+ }).WithStorageAccelProps(storage.AccelEngineNone, + storage.AccelOptCRCFlag|storage.AccelOptMoveFlag), expOut: ` { "daos_data": { @@ -527,15 +518,70 @@ func TestBackend_writeJSONFile(t *testing.T) { `, }, "nvme; single controller; acceleration set to spdk; no opts specified": { - confIn: storage.TierConfig{ + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ + Tier: tierID, + Class: storage.ClassNvme, + Bdev: storage.BdevConfig{ + DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1)...), + }, + // Verify default "spdk" acceleration setting with no enable options is ignored. + }).WithStorageAccelProps(storage.AccelEngineSPDK, 0), + expOut: ` +{ + "daos_data": { + "config": [] + }, + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + { + "params": { + "bdev_io_pool_size": 65536, + "bdev_io_cache_size": 256 + }, + "method": "bdev_set_options" + }, + { + "params": { + "retry_count": 4, + "timeout_us": 0, + "nvme_adminq_poll_period_us": 100000, + "action_on_timeout": "none", + "nvme_ioq_poll_period_us": 0 + }, + "method": "bdev_nvme_set_options" + }, + { + "params": { + "enable": false, + "period_us": 0 + }, + "method": "bdev_nvme_set_hotplug" + }, + { + "params": { + "trtype": "PCIe", + "name": "Nvme_hostfoo_0_84_0", + "traddr": "0000:01:00.0" + }, + "method": "bdev_nvme_attach_controller" + } + ] + } + ] +} +`, + }, + "nvme; single controller; auto faulty disabled but criteria set": { + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1)...), }, - }, - // Verify default "spdk" acceleration setting with no enable options is ignored. - accelEngine: storage.AccelEngineSPDK, + // Verify "false" auto faulty setting is ignored. + }).WithStorageAutoFaultyCriteria(false, 100, 200), expOut: ` { "daos_data": { @@ -583,18 +629,18 @@ func TestBackend_writeJSONFile(t *testing.T) { } `, }, - "nvme; single controller; accel set with opts; rpc srv set": { - confIn: storage.TierConfig{ + "nvme; single controller; accel set with opts; rpc srv set; auto faulty criteria": { + confIn: engine.MockConfig().WithStorage(&storage.TierConfig{ Tier: tierID, Class: storage.ClassNvme, Bdev: storage.BdevConfig{ DeviceList: storage.MustNewBdevDeviceList(test.MockPCIAddrs(1)...), }, - }, - accelEngine: storage.AccelEngineSPDK, - accelOptMask: storage.AccelOptCRCFlag | storage.AccelOptMoveFlag, - rpcSrvEnable: true, - rpcSrvSockAddr: "/tmp/spdk.sock", + }). + WithStorageAccelProps(storage.AccelEngineSPDK, + storage.AccelOptCRCFlag|storage.AccelOptMoveFlag). + WithStorageSpdkRpcSrvProps(true, "/tmp/spdk.sock"). + WithStorageAutoFaultyCriteria(true, 100, 200), expOut: ` { "daos_data": { @@ -612,6 +658,14 @@ func TestBackend_writeJSONFile(t *testing.T) { "sock_addr": "/tmp/spdk.sock" }, "method": "spdk_rpc_srv" + }, + { + "params": { + "enable": true, + "max_io_errs": 100, + "max_csum_errs": 200 + }, + "method": "auto_faulty" } ] }, @@ -668,24 +722,10 @@ func TestBackend_writeJSONFile(t *testing.T) { defer clean() cfgOutputPath := filepath.Join(testDir, "outfile") - engineConfig := engine.MockConfig(). - WithFabricProvider("test"). // valid enough to pass "not-blank" test - WithFabricInterface("test"). - WithFabricInterfacePort(42). - WithStorage( - storage.NewTierConfig(). - WithStorageClass("dcpm"). - WithScmDeviceList("foo"). - WithScmMountPoint("scmmnt"), - &tc.confIn, - ). - WithStorageConfigOutputPath(cfgOutputPath). 
- WithStorageEnableHotplug(tc.enableHotplug). - WithStorageAccelProps(tc.accelEngine, tc.accelOptMask). - WithStorageSpdkRpcSrvProps(tc.rpcSrvEnable, tc.rpcSrvSockAddr) req, err := storage.BdevWriteConfigRequestFromConfig(test.Context(t), log, - &engineConfig.Storage, tc.enableVmd, storage.MockGetTopology) + &(tc.confIn.WithStorageConfigOutputPath(cfgOutputPath)).Storage, + tc.enableVmd, storage.MockGetTopology) if err != nil { t.Fatal(err) } diff --git a/src/control/server/storage/bdev/backend_json.go b/src/control/server/storage/bdev/backend_json.go index 7be0328e67bb..b68cfe56c9a7 100644 --- a/src/control/server/storage/bdev/backend_json.go +++ b/src/control/server/storage/bdev/backend_json.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -40,7 +40,7 @@ type SetOptionsParams struct { BdevIoCacheSize uint64 `json:"bdev_io_cache_size"` } -func (sop SetOptionsParams) isSpdkSubsystemConfigParams() {} +func (_ SetOptionsParams) isSpdkSubsystemConfigParams() {} // NvmeSetOptionsParams specifies details for a storage.ConfBdevNvmeSetOptions method. type NvmeSetOptionsParams struct { @@ -51,7 +51,7 @@ type NvmeSetOptionsParams struct { NvmeIoqPollPeriodUsec uint32 `json:"nvme_ioq_poll_period_us"` } -func (nsop NvmeSetOptionsParams) isSpdkSubsystemConfigParams() {} +func (_ NvmeSetOptionsParams) isSpdkSubsystemConfigParams() {} // NvmeAttachControllerParams specifies details for a storage.ConfBdevNvmeAttachController // method. @@ -61,7 +61,7 @@ type NvmeAttachControllerParams struct { TransportAddress string `json:"traddr"` } -func (napp NvmeAttachControllerParams) isSpdkSubsystemConfigParams() {} +func (_ NvmeAttachControllerParams) isSpdkSubsystemConfigParams() {} // NvmeSetHotplugParams specifies details for a storage.ConfBdevNvmeSetHotplug method. type NvmeSetHotplugParams struct { @@ -69,7 +69,7 @@ type NvmeSetHotplugParams struct { PeriodUsec uint64 `json:"period_us"` } -func (nshp NvmeSetHotplugParams) isSpdkSubsystemConfigParams() {} +func (_ NvmeSetHotplugParams) isSpdkSubsystemConfigParams() {} // VmdEnableParams specifies details for a storage.ConfVmdEnable method. type VmdEnableParams struct{} @@ -83,7 +83,7 @@ type AioCreateParams struct { Filename string `json:"filename"` } -func (acp AioCreateParams) isSpdkSubsystemConfigParams() {} +func (_ AioCreateParams) isSpdkSubsystemConfigParams() {} // HotplugBusidRangeParams specifies details for a storage.ConfSetHotplugBusidRange method. type HotplugBusidRangeParams struct { @@ -91,17 +91,22 @@ type HotplugBusidRangeParams struct { End uint8 `json:"end"` } -func (hbrp HotplugBusidRangeParams) isDaosConfigParams() {} +func (_ HotplugBusidRangeParams) isDaosConfigParams() {} // AccelPropsParams specifies details for a storage.ConfSetAccelProps method. type AccelPropsParams storage.AccelProps -func (app AccelPropsParams) isDaosConfigParams() {} +func (_ AccelPropsParams) isDaosConfigParams() {} // SpdkRpcServerParams specifies details for a storage.ConfSetSpdkRpcServer method. type SpdkRpcServerParams storage.SpdkRpcServer -func (srsp SpdkRpcServerParams) isDaosConfigParams() {} +func (_ SpdkRpcServerParams) isDaosConfigParams() {} + +// AutoFaultyParams specifies details for a storage.ConfSetAutoFaultyProp method. +type AutoFaultyParams storage.BdevAutoFaulty + +func (_ AutoFaultyParams) isDaosConfigParams() {} // SpdkSubsystemConfig entries apply to any SpdkSubsystem. 
type SpdkSubsystemConfig struct { @@ -301,6 +306,17 @@ func rpcSrvSet(req *storage.BdevWriteConfigRequest, data *DaosData) { } } +// Add NVMe auto-faulty settings to DAOS config data. +func autoFaultySet(req *storage.BdevWriteConfigRequest, data *DaosData) { + props := req.AutoFaultyProps + if props.Enable { + data.Configs = append(data.Configs, &DaosConfig{ + Method: storage.ConfSetAutoFaultyProps, + Params: AutoFaultyParams(props), + }) + } +} + func newSpdkConfig(log logging.Logger, req *storage.BdevWriteConfigRequest) (*SpdkConfig, error) { sc := defaultSpdkConfig() @@ -338,6 +354,7 @@ func newSpdkConfig(log logging.Logger, req *storage.BdevWriteConfigRequest) (*Sp accelPropSet(req, sc.DaosData) rpcSrvSet(req, sc.DaosData) + autoFaultySet(req, sc.DaosData) return sc.WithBdevConfigs(log, req), nil } diff --git a/src/control/server/storage/bdev/backend_json_test.go b/src/control/server/storage/bdev/backend_json_test.go index d711103dc60a..504d81b9beec 100644 --- a/src/control/server/storage/bdev/backend_json_test.go +++ b/src/control/server/storage/bdev/backend_json_test.go @@ -83,6 +83,9 @@ func TestBackend_newSpdkConfig(t *testing.T) { accelOptMask storage.AccelOptionBits rpcSrvEnable bool rpcSrvSockAddr string + autoFaultyEnable bool + autoFaultyIO uint32 + autoFaultyCsum uint32 expExtraSubsystems []*SpdkSubsystem expBdevCfgs []*SpdkSubsystemConfig expDaosCfgs []*DaosConfig @@ -185,14 +188,17 @@ func TestBackend_newSpdkConfig(t *testing.T) { }...), vosEnv: "AIO", }, - "multiple controllers; accel & rpc server settings": { - class: storage.ClassNvme, - devList: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, - accelEngine: storage.AccelEngineSPDK, - accelOptMask: storage.AccelOptCRCFlag | storage.AccelOptMoveFlag, - rpcSrvEnable: true, - rpcSrvSockAddr: "/tmp/spdk.sock", - expBdevCfgs: multiCtrlrConfs(), + "multiple controllers; accel, rpc server & auto faulty settings": { + class: storage.ClassNvme, + devList: []string{test.MockPCIAddr(1), test.MockPCIAddr(2)}, + accelEngine: storage.AccelEngineSPDK, + accelOptMask: storage.AccelOptCRCFlag | storage.AccelOptMoveFlag, + rpcSrvEnable: true, + rpcSrvSockAddr: "/tmp/spdk.sock", + autoFaultyEnable: true, + autoFaultyIO: 100, + autoFaultyCsum: 200, + expBdevCfgs: multiCtrlrConfs(), expDaosCfgs: []*DaosConfig{ { Method: storage.ConfSetAccelProps, @@ -208,6 +214,14 @@ func TestBackend_newSpdkConfig(t *testing.T) { SockAddr: "/tmp/spdk.sock", }, }, + { + Method: storage.ConfSetAutoFaultyProps, + Params: AutoFaultyParams{ + Enable: true, + MaxIoErrs: 100, + MaxCsumErrs: 200, + }, + }, }, }, } @@ -251,7 +265,9 @@ func TestBackend_newSpdkConfig(t *testing.T) { WithTargetCount(8). WithPinnedNumaNode(0). WithStorageAccelProps(tc.accelEngine, tc.accelOptMask). - WithStorageSpdkRpcSrvProps(tc.rpcSrvEnable, tc.rpcSrvSockAddr) + WithStorageSpdkRpcSrvProps(tc.rpcSrvEnable, tc.rpcSrvSockAddr). + WithStorageAutoFaultyCriteria(tc.autoFaultyEnable, tc.autoFaultyIO, + tc.autoFaultyCsum) if tc.devRoles != 0 { engineConfig.Storage.ControlMetadata = storage.ControlMetadata{ diff --git a/src/control/server/storage/config.go b/src/control/server/storage/config.go index feab22be6cf0..ac3ac8da6769 100644 --- a/src/control/server/storage/config.go +++ b/src/control/server/storage/config.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. 
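
`autoFaultySet()` above appends a config entry only when the criteria are enabled, which is why the `WithStorageAutoFaultyCriteria(false, 100, 200)` test case expects no `auto_faulty` entry in its output. A reduced sketch of that gate, with simplified stand-in types:

```go
package main

import "fmt"

type bdevAutoFaulty struct {
	Enable      bool
	MaxIoErrs   uint32
	MaxCsumErrs uint32
}

type daosConfig struct {
	Method string
	Params bdevAutoFaulty
}

// autoFaultySet appends an auto_faulty entry only when the feature is
// enabled; disabled criteria are silently dropped from the config.
func autoFaultySet(props bdevAutoFaulty, configs []daosConfig) []daosConfig {
	if !props.Enable {
		return configs
	}
	return append(configs, daosConfig{Method: "auto_faulty", Params: props})
}

func main() {
	var cfgs []daosConfig
	cfgs = autoFaultySet(bdevAutoFaulty{Enable: false, MaxIoErrs: 100, MaxCsumErrs: 200}, cfgs)
	fmt.Println(len(cfgs)) // 0: disabled criteria are ignored
	cfgs = autoFaultySet(bdevAutoFaulty{Enable: true, MaxIoErrs: 100, MaxCsumErrs: 200}, cfgs)
	fmt.Println(len(cfgs)) // 1
}
```
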
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -43,6 +43,7 @@ const ( accelOptMoveName = "move" accelOptCRCName = "crc" + bdevRoleNoneName = "na" bdevRoleDataName = "data" bdevRoleMetaName = "meta" bdevRoleWALName = "wal" @@ -252,10 +253,18 @@ func (tcs TierConfigs) getBdevs(nvmeOnly bool) *BdevDeviceList { } func (tcs TierConfigs) Bdevs() *BdevDeviceList { + if len(tcs) == 0 { + return new(BdevDeviceList) + } + return tcs.getBdevs(false) } func (tcs TierConfigs) NVMeBdevs() *BdevDeviceList { + if len(tcs) == 0 { + return new(BdevDeviceList) + } + return tcs.getBdevs(true) } @@ -281,18 +290,34 @@ func (tcs TierConfigs) checkBdevs(nvmeOnly, emulOnly bool) bool { } func (tcs TierConfigs) HaveBdevs() bool { + if len(tcs) == 0 { + return false + } + return tcs.checkBdevs(false, false) } func (tcs TierConfigs) HaveRealNVMe() bool { + if len(tcs) == 0 { + return false + } + return tcs.checkBdevs(true, false) } func (tcs TierConfigs) HaveEmulatedNVMe() bool { + if len(tcs) == 0 { + return false + } + return tcs.checkBdevs(false, true) } func (tcs TierConfigs) HasBdevRoleMeta() bool { + if len(tcs) == 0 { + return false + } + for _, bc := range tcs.BdevConfigs() { bits := bc.Bdev.DeviceRoles.OptionBits if (bits & BdevRoleMeta) != 0 { @@ -442,6 +467,10 @@ func (tcs TierConfigs) validateBdevRoles() error { // - If the scm tier is of class dcpm, the first (and only) bdev tier should have the Data role. // - If emulated NVMe is present in bdev tiers, implicit role assignment is skipped. func (tcs TierConfigs) AssignBdevTierRoles(extMetadataPath string) error { + if len(tcs) == 0 { + return errors.New("no storage tiers configured") + } + if extMetadataPath == "" { return nil // MD-on-SSD not enabled. } @@ -846,6 +875,9 @@ func (obs *OptionBits) fromStrings(optStr2Flag optFlagMap, opts ...string) error if len(opt) == 0 { continue } + if strings.ToLower(opt) == bdevRoleNoneName { + break + } flag, exists := optStr2Flag[opt] if !exists { return FaultBdevConfigOptFlagUnknown(opt, optStr2Flag.keys()...) @@ -893,13 +925,19 @@ func (bdr BdevRoles) MarshalJSON() ([]byte, error) { // UnmarshalJSON decodes user readable roles string into bitmask. func (bdr *BdevRoles) UnmarshalJSON(data []byte) error { str := strings.Trim(strings.ToLower(string(data)), "\"") + if str == bdevRoleNoneName { + bdr.OptionBits = OptionBits(0) + return nil + } + return bdr.fromStrings(roleOptFlags, strings.Split(str, ",")...) } func (bdr *BdevRoles) String() string { - if bdr == nil { - return "none" + if bdr == nil || bdr.IsEmpty() { + return strings.ToUpper(bdevRoleNoneName) } + return bdr.toString(roleOptFlags) } @@ -1060,6 +1098,14 @@ type SpdkRpcServer struct { SockAddr string `yaml:"sock_addr,omitempty" json:"sock_addr"` } +// BdevAutoFaulty struct describes settings for detection of faulty NVMe devices within the BIO +// module of the engine process. 
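
The changes above reserve the literal `na` as an explicit "no roles" marker: `fromStrings()` stops parsing on it, `UnmarshalJSON()` maps it to an empty bitmask, and `String()` renders an empty mask as `NA`. A standalone sketch of that round-trip over a simplified bitmask type (names invented):

```go
package main

import (
	"fmt"
	"strings"
)

type bdevRoles uint32

const (
	roleData bdevRoles = 1 << iota
	roleMeta
	roleWAL
)

var roleNames = map[string]bdevRoles{"data": roleData, "meta": roleMeta, "wal": roleWAL}

// parseRoles treats the reserved literal "na" as an explicitly empty mask,
// mirroring the UnmarshalJSON special case in the hunk above.
func parseRoles(s string) (bdevRoles, error) {
	s = strings.ToLower(strings.Trim(s, `"`))
	if s == "na" {
		return 0, nil
	}
	var out bdevRoles
	for _, tok := range strings.Split(s, ",") {
		r, ok := roleNames[tok]
		if !ok {
			return 0, fmt.Errorf("unknown role %q", tok)
		}
		out |= r
	}
	return out, nil
}

// String renders an empty mask as "NA", matching the new test expectation.
func (r bdevRoles) String() string {
	if r == 0 {
		return "NA"
	}
	var toks []string
	for _, name := range []string{"data", "meta", "wal"} {
		if r&roleNames[name] != 0 {
			toks = append(toks, name)
		}
	}
	return strings.Join(toks, ",")
}

func main() {
	r, _ := parseRoles(`"na"`)
	fmt.Println(r) // NA
	r, _ = parseRoles("data,meta,wal")
	fmt.Println(r) // data,meta,wal
}
```
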
+type BdevAutoFaulty struct { + Enable bool `yaml:"enable,omitempty" json:"enable"` + MaxIoErrs uint32 `yaml:"max_io_errs,omitempty" json:"max_io_errs"` + MaxCsumErrs uint32 `yaml:"max_csum_errs,omitempty" json:"max_csum_errs"` +} + type Config struct { ControlMetadata ControlMetadata `yaml:"-"` // inherited from server EngineIdx uint `yaml:"-"` @@ -1070,6 +1116,7 @@ type Config struct { NumaNodeIndex uint `yaml:"-"` AccelProps AccelProps `yaml:"acceleration,omitempty"` SpdkRpcSrvProps SpdkRpcServer `yaml:"spdk_rpc_server,omitempty"` + AutoFaultyProps BdevAutoFaulty `yaml:"bdev_auto_faulty,omitempty"` } func (c *Config) SetNUMAAffinity(node uint) { diff --git a/src/control/server/storage/config_test.go b/src/control/server/storage/config_test.go index be4ddf8efc59..f595aaa4ffac 100644 --- a/src/control/server/storage/config_test.go +++ b/src/control/server/storage/config_test.go @@ -775,6 +775,29 @@ storage: } } +func TestStorage_BdevDeviceRoles_String(t *testing.T) { + for name, tc := range map[string]struct { + bits OptionBits + expOut string + }{ + "empty": { + bits: OptionBits(0), + expOut: "NA", + }, + "all": { + bits: OptionBits(BdevRoleAll), + expOut: "data,meta,wal", + }, + } { + t.Run(name, func(t *testing.T) { + bdr := BdevRoles{OptionBits: tc.bits} + if diff := cmp.Diff(bdr.String(), tc.expOut); diff != "" { + t.Fatalf("bad output (-want +got):\n%s", diff) + } + }) + } +} + func TestStorage_AccelProps_FromYAML(t *testing.T) { for name, tc := range map[string]struct { input string diff --git a/src/control/server/storage/provider.go b/src/control/server/storage/provider.go index 5d99ae38e9bc..a8cad7e861b3 100644 --- a/src/control/server/storage/provider.go +++ b/src/control/server/storage/provider.go @@ -471,9 +471,10 @@ func BdevFormatRequestFromConfig(log logging.Logger, cfg *TierConfig) (BdevForma // BdevTierFormatResult contains details of a format operation result. 
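
The `yaml:"bdev_auto_faulty,omitempty"` tag on the new `AutoFaultyProps` field is what binds the engine-section `bdev_auto_faulty` key to `BdevAutoFaulty`. A minimal decoding sketch using `gopkg.in/yaml.v2`, with the config reduced to just the new fields:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

type bdevAutoFaulty struct {
	Enable      bool   `yaml:"enable,omitempty"`
	MaxIoErrs   uint32 `yaml:"max_io_errs,omitempty"`
	MaxCsumErrs uint32 `yaml:"max_csum_errs,omitempty"`
}

type storageConfig struct {
	AutoFaultyProps bdevAutoFaulty `yaml:"bdev_auto_faulty,omitempty"`
}

func main() {
	in := `
bdev_auto_faulty:
  enable: true
  max_io_errs: 1
  max_csum_errs: 2
`
	var cfg storageConfig
	if err := yaml.Unmarshal([]byte(in), &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg.AutoFaultyProps) // {Enable:true MaxIoErrs:1 MaxCsumErrs:2}
}
```

One wrinkle of `omitempty` on `enable`: an explicit `enable: false` re-marshals to an absent key, which is harmless here since a zero-valued `BdevAutoFaulty` leaves the feature disabled anyway.
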
type BdevTierFormatResult struct { - Tier int - Error error - Result *BdevFormatResponse + Tier int + DeviceRoles BdevRoles + Error error + Result *BdevFormatResponse } // FormatBdevTiers formats all the Bdev tiers in the engine storage @@ -505,6 +506,7 @@ func (p *Provider) FormatBdevTiers(ctrlrs NvmeControllers) (results []BdevTierFo p.RUnlock() results[i].Tier = cfg.Tier + results[i].DeviceRoles = cfg.Bdev.DeviceRoles if err := results[i].Error; err != nil { p.log.Errorf("Instance %d: format failed (%s)", err) continue @@ -573,6 +575,7 @@ func BdevWriteConfigRequestFromConfig(ctx context.Context, log logging.Logger, c TierProps: []BdevTierProperties{}, AccelProps: cfg.AccelProps, SpdkRpcSrvProps: cfg.SpdkRpcSrvProps, + AutoFaultyProps: cfg.AutoFaultyProps, } for idx, tier := range cfg.Tiers.BdevConfigs() { diff --git a/src/control/server/storage/provider_test.go b/src/control/server/storage/provider_test.go index 688563161f19..e2dd9c2df67e 100644 --- a/src/control/server/storage/provider_test.go +++ b/src/control/server/storage/provider_test.go @@ -222,6 +222,33 @@ func Test_BdevWriteRequestFromConfig(t *testing.T) { }, }, }, + "auto faulty criteria applied": { + cfg: &Config{ + Tiers: TierConfigs{ + mockScmTier, + NewTierConfig().WithStorageClass(ClassNvme.String()), + }, + AutoFaultyProps: BdevAutoFaulty{ + Enable: true, + MaxIoErrs: 10000, + MaxCsumErrs: 20000, + }, + }, + getTopoFn: MockGetTopology, + expReq: &BdevWriteConfigRequest{ + OwnerUID: os.Geteuid(), + OwnerGID: os.Getegid(), + TierProps: []BdevTierProperties{ + {Class: ClassNvme}, + }, + Hostname: hostname, + AutoFaultyProps: BdevAutoFaulty{ + Enable: true, + MaxIoErrs: 10000, + MaxCsumErrs: 20000, + }, + }, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(name) diff --git a/src/control/system/raft/database.go b/src/control/system/raft/database.go index 2849edb8c087..9db013aa26de 100644 --- a/src/control/system/raft/database.go +++ b/src/control/system/raft/database.go @@ -1055,7 +1055,7 @@ func (db *Database) handlePoolRepsUpdate(evt *events.RASEvent) { ctx := context.Background() lock, err := db.TakePoolLock(ctx, poolUUID) if err != nil { - db.log.Errorf("failed to take lock for pool svc update: %s", err) + db.log.Noticef("failed to take lock for pool svc update: %s", err) return } defer lock.Release() diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 0a3d2b193a75..98c942f72f32 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1634,8 +1634,8 @@ dtx_reindex_ult(void *arg) struct dss_module_info *dmi = dss_get_module_info(); int rc = 0; - D_INFO(DF_CONT": starting DTX reindex ULT on xstream %d, ver %u\n", - DP_CONT(NULL, cont->sc_uuid), dmi->dmi_tgt_id, dtx_cont2ver(cont)); + D_DEBUG(DB_MD, DF_CONT": starting DTX reindex ULT on xstream %d, ver %u\n", + DP_CONT(NULL, cont->sc_uuid), dmi->dmi_tgt_id, dtx_cont2ver(cont)); while (!cont->sc_dtx_reindex_abort && !dss_xstream_exiting(dmi->dmi_xstream)) { rc = vos_dtx_cmt_reindex(cont->sc_hdl); @@ -1645,7 +1645,7 @@ dtx_reindex_ult(void *arg) ABT_thread_yield(); } - D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, + D_CDEBUG(rc < 0, DLOG_ERR, DLOG_DBG, DF_CONT": stopping DTX reindex ULT on stream %d, ver %u: rc = %d\n", DP_CONT(NULL, cont->sc_uuid), dmi->dmi_tgt_id, dtx_cont2ver(cont), rc); diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 4a7661a51674..14f7f11b44ee 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -606,8 +606,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b 
return rc; } - D_INFO("Enter DTX resync for "DF_UUID"/"DF_UUID" with version: %u\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + D_DEBUG(DB_MD, "Enter DTX resync (%s) for "DF_UUID"/"DF_UUID" with ver %u\n", + block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver); crt_group_rank(NULL, &myrank); @@ -619,8 +619,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b if (target->ta_comp.co_status == PO_COMP_ST_UP) { dra.discard_version = target->ta_comp.co_in_ver; - D_INFO("DTX resync for "DF_UUID"/"DF_UUID" discard version: %u\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), dra.discard_version); + D_DEBUG(DB_MD, "DTX resync for "DF_UUID"/"DF_UUID" discard version: %u\n", + DP_UUID(po_uuid), DP_UUID(co_uuid), dra.discard_version); } ABT_rwlock_unlock(pool->sp_lock); @@ -675,8 +675,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b } } - D_INFO("Start DTX resync scan for "DF_UUID"/"DF_UUID" with version %u\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + D_DEBUG(DB_MD, "Start DTX resync (%s) scan for "DF_UUID"/"DF_UUID" with ver %u\n", + block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver); rc = ds_cont_iter(po_hdl, co_uuid, dtx_iter_cb, &dra, VOS_ITER_DTX, 0); @@ -690,8 +690,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b if (rc >= 0) rc = rc1; - D_INFO("Stop DTX resync scan for "DF_UUID"/"DF_UUID" with version %u: rc = %d\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); + D_DEBUG(DB_MD, "Stop DTX resync (%s) scan for "DF_UUID"/"DF_UUID" with ver %u: rc = %d\n", + block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); fail: ABT_mutex_lock(cont->sc_mutex); @@ -703,8 +703,8 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b if (!dtx_cont_opened(cont)) stop_dtx_reindex_ult(cont); - D_INFO("Exit DTX resync for "DF_UUID"/"DF_UUID" with version: %u\n", - DP_UUID(po_uuid), DP_UUID(co_uuid), ver); + D_DEBUG(DB_MD, "Exit DTX resync (%s) for "DF_UUID"/"DF_UUID" with ver %u, rc = %d\n", + block ? "block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); ds_cont_child_put(cont); return rc > 0 ? 
0 : rc; diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index fdcda4abd3fc..e9a34afa31fd 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1101,8 +1101,8 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (rc1 != DTX_ST_COMMITTED && rc1 != DTX_ST_ABORTED && rc1 != -DER_NONEXIST) { if (!for_io) - D_INFO("Hit some long-time DTX "DF_DTI", %d\n", - DP_DTI(&dsp->dsp_xid), rc1); + D_WARN("Hit unexpected long-time DTX " + DF_DTI": %d\n", DP_DTI(&dsp->dsp_xid), rc1); else if (rc == 0) rc = -DER_INPROGRESS; } diff --git a/src/engine/init.c b/src/engine/init.c index 233798787009..0f705e5366b2 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -432,14 +432,15 @@ dss_init_state_set(enum dss_init_state state) static int abt_max_num_xstreams(void) { - char *env; + unsigned num_xstreams = 0; - env = getenv("ABT_MAX_NUM_XSTREAMS"); - if (env == NULL) - env = getenv("ABT_ENV_MAX_NUM_XSTREAMS"); - if (env != NULL) - return atoi(env); - return 0; + if (d_isenv_def("ABT_MAX_NUM_XSTREAMS")) + d_getenv_uint("ABT_MAX_NUM_XSTREAMS", &num_xstreams); + else + d_getenv_uint("ABT_ENV_MAX_NUM_XSTREAMS", &num_xstreams); + D_ASSERT(num_xstreams <= INT_MAX); + + return num_xstreams; } static int diff --git a/src/engine/sched.c b/src/engine/sched.c index 73bb142488bb..70b37d9d4be8 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -2095,7 +2095,9 @@ watchdog_enabled(struct dss_xstream *dx) if (sched_unit_runtime_max == 0) return false; - return dx->dx_xs_id == 0 || (sched_watchdog_all && dx->dx_main_xs); + /* Enable watchdog for system and swim xstream by default. */ + return dx->dx_xs_id == 0 || dx->dx_xs_id == 1 || + (sched_watchdog_all && dx->dx_xs_id != 2); } int diff --git a/src/engine/server_iv.c b/src/engine/server_iv.c index a7d258705a37..5f5d00722cc5 100644 --- a/src/engine/server_iv.c +++ b/src/engine/server_iv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1053,7 +1053,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, retry: rc = iv_op_internal(ns, key, value, sync, shortcut, opc); if (retry && !ns->iv_stop && - (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER)) { + (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY)) { if (rc == -DER_NOTLEADER && key->rank != (d_rank_t)(-1) && sync && (sync->ivs_mode == CRT_IV_SYNC_LAZY || sync->ivs_mode == CRT_IV_SYNC_EAGER)) { @@ -1070,7 +1070,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, * but in-flight fetch request return IVCB_FORWARD, then queued RPC will * reply IVCB_FORWARD. 
*/ - D_WARN("ns %u retry for class %d opc %d rank %u/%u: " DF_RC "\n", ns->iv_ns_id, + D_INFO("ns %u retry for class %d opc %d rank %u/%u: " DF_RC "\n", ns->iv_ns_id, key->class_id, opc, key->rank, ns->iv_master_rank, DP_RC(rc)); /* sleep 1sec and retry */ dss_sleep(1000); diff --git a/src/engine/srv.c b/src/engine/srv.c index df0733ed6383..0afc13861d15 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -1048,13 +1048,14 @@ dss_xstreams_init(void) sched_relax_intvl); } - env = getenv("DAOS_SCHED_RELAX_MODE"); + d_agetenv_str(&env, "DAOS_SCHED_RELAX_MODE"); if (env) { sched_relax_mode = sched_relax_str2mode(env); if (sched_relax_mode == SCHED_RELAX_MODE_INVALID) { D_WARN("Invalid relax mode [%s]\n", env); sched_relax_mode = SCHED_RELAX_MODE_NET; } + d_freeenv_str(&env); } D_INFO("CPU relax mode is set to [%s]\n", sched_relax_mode2str(sched_relax_mode)); diff --git a/src/gurt/debug.c b/src/gurt/debug.c index 4fb112b7dcc4..bc672c03067e 100644 --- a/src/gurt/debug.c +++ b/src/gurt/debug.c @@ -380,7 +380,7 @@ debug_prio_err_load_env(void) char *env; int i; - env = getenv(DD_STDERR_ENV); + d_agetenv_str(&env, DD_STDERR_ENV); if (env == NULL) return; @@ -395,6 +395,7 @@ debug_prio_err_load_env(void) /* invalid DD_STDERR option */ if (d_dbglog_data.dd_prio_err == 0) D_PRINT_ERR("DD_STDERR = %s - invalid option\n", env); + d_freeenv_str(&env); } void @@ -415,7 +416,16 @@ d_log_sync_mask_ex(const char *log_mask, const char *dd_mask) void d_log_sync_mask(void) { - d_log_sync_mask_ex(getenv(D_LOG_MASK_ENV), getenv(DD_MASK_ENV)); + char *log_mask; + char *dd_mask; + + d_agetenv_str(&log_mask, D_LOG_MASK_ENV); + d_agetenv_str(&dd_mask, DD_MASK_ENV); + + d_log_sync_mask_ex(log_mask, dd_mask); + + d_freeenv_str(&dd_mask); + d_freeenv_str(&log_mask); } /** @@ -540,14 +550,15 @@ d_log_init(void) int flags = DLOG_FLV_LOGPID | DLOG_FLV_FAC | DLOG_FLV_TAG; int rc; - log_file = getenv(D_LOG_FILE_ENV); + d_agetenv_str(&log_file, D_LOG_FILE_ENV); if (log_file == NULL || strlen(log_file) == 0) { flags |= DLOG_FLV_STDOUT; - log_file = NULL; + d_freeenv_str(&log_file); } rc = d_log_init_adv("CaRT", log_file, flags, DLOG_WARN, DLOG_EMERG, NULL); + d_freeenv_str(&log_file); if (rc != DER_SUCCESS) { D_PRINT_ERR("d_log_init_adv failed, rc: %d.\n", rc); D_GOTO(out, rc); diff --git a/src/gurt/dlog.c b/src/gurt/dlog.c index 7cbce2fa7b6e..2f1324463c08 100644 --- a/src/gurt/dlog.c +++ b/src/gurt/dlog.c @@ -847,20 +847,20 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, if (pri != -1) mst.flush_pri = pri; - d_free_env_str(&env); + d_freeenv_str(&env); } d_agetenv_str(&env, D_LOG_TRUNCATE_ENV); if (env != NULL && atoi(env) > 0) truncate = 1; - d_free_env_str(&env); + d_freeenv_str(&env); d_agetenv_str(&env, D_LOG_SIZE_ENV); if (env != NULL) { log_size = d_getenv_size(env); if (log_size < LOG_SIZE_MIN) log_size = LOG_SIZE_MIN; - d_free_env_str(&env); + d_freeenv_str(&env); } d_agetenv_str(&env, D_LOG_FILE_APPEND_PID_ENV); @@ -875,12 +875,12 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, "continuing.\n"); } } - d_free_env_str(&env); + d_freeenv_str(&env); d_agetenv_str(&env, D_LOG_FILE_APPEND_RANK_ENV); if (env && strcmp(env, "0") != 0) mst.append_rank = true; - d_free_env_str(&env); + d_freeenv_str(&env); /* quick sanity check (mst.tag is non-null if already open) */ if (d_log_xst.tag || !tag || @@ -918,7 +918,7 @@ d_log_open(char *tag, int maxfac_hint, int default_mask, int stderr_mask, d_agetenv_str(&env, D_LOG_STDERR_IN_LOG_ENV); if (env != NULL && atoi(env) > 0) 
merge_stderr = true; - d_free_env_str(&env); + d_freeenv_str(&env); if (!truncate) log_flags |= O_APPEND; @@ -1107,7 +1107,7 @@ bool d_logfac_is_enabled(const char *fac_name) rc = true; out: - d_free_env_str(&ddsubsys_env); + d_freeenv_str(&ddsubsys_env); return rc; } diff --git a/src/gurt/fault_inject.c b/src/gurt/fault_inject.c index 4ffbd2c40e4f..f5225bab3dd5 100644 --- a/src/gurt/fault_inject.c +++ b/src/gurt/fault_inject.c @@ -616,7 +616,7 @@ d_fault_inject_init(void) out: if (fp) fclose(fp); - d_free_env_str(&config_file); + d_freeenv_str(&config_file); return rc; } diff --git a/src/gurt/misc.c b/src/gurt/misc.c index ffb1a85bb0d4..d92055a905d4 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -1120,7 +1120,7 @@ d_agetenv_str(char **str_val, const char *name) * \param[in,out] str_val Copy of an environment string value. */ void -d_free_env_str(char **str_val) +d_freeenv_str(char **str_val) { assert(str_val != NULL); @@ -1302,6 +1302,21 @@ d_getenv_uint(const char *name, unsigned *uint_val) return -DER_SUCCESS; } +/** + * Get an unsigned integer environment variable. + * + * \param[in] name name of the environment variable. + * \param[in,out] uint_val returned value of the ENV. Will not change the original + * value if the ENV is not set or is set to a non-integer value. + * \return 0 on success, a negative value on error. + * \deprecated d_getenv_int() is deprecated, please use d_getenv_uint(). + */ +int +d_getenv_int(const char *name, unsigned *uint_val) +{ + return d_getenv_uint(name, uint_val); +} + /** * get a 32bits unsigned integer type environment variables * diff --git a/src/gurt/tests/test_gurt.c b/src/gurt/tests/test_gurt.c index ebb9a0ec7017..e9f8f4354867 100644 --- a/src/gurt/tests/test_gurt.c +++ b/src/gurt/tests/test_gurt.c @@ -2140,7 +2140,7 @@ test_d_agetenv_str(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_non_null(env); assert_string_equal(env, "bar"); - d_free_env_str(&env); + d_freeenv_str(&env); assert_null(env); getenv_return = ""; @@ -2148,7 +2148,7 @@ test_d_agetenv_str(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_non_null(env); assert_string_equal(env, ""); - d_free_env_str(&env); + d_freeenv_str(&env); assert_null(env); getenv_return = NULL; diff --git a/src/include/daos/btree.h b/src/include/daos/btree.h index 6b0e1a705381..fac8de7ec104 100644 --- a/src/include/daos/btree.h +++ b/src/include/daos/btree.h @@ -486,7 +486,10 @@ enum btr_feats { BTR_FEAT_DYNAMIC_ROOT = (1 << 2), /** Skip rebalance leaf when delete some record from the leaf. */ BTR_FEAT_SKIP_LEAF_REBAL = (1 << 3), - + /** Tree supports embedded root.
*/ + BTR_FEAT_EMBED_FIRST = (1 << 4), + /** Marks that the current root is an embedded value */ + BTR_FEAT_EMBEDDED = (1 << 5), /** Put new entries above this line */ /** Convenience entry for calculating mask for all feats */ BTR_FEAT_HELPER, diff --git a/src/include/daos/common.h b/src/include/daos/common.h index 7a3088b53c8a..e13b561aae53 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -851,6 +851,8 @@ enum { #define DAOS_POOL_CREATE_FAIL_STEP_UP (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6b) #define DAOS_MD_OP_PASS_NOREPLY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6c) #define DAOS_MD_OP_FAIL_NOREPLY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6d) +#define DAOS_MD_OP_PASS_NOREPLY_NEWLDR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6e) +#define DAOS_MD_OP_FAIL_NOREPLY_NEWLDR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6f) /** interoperability failure inject */ #define FLC_SMD_DF_VER (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x70) diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index 48de8fff62b7..f2b28c6a76b9 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -158,4 +158,13 @@ daos_lru_ref_add(struct daos_llink *llink) llink->ll_ref++; } +/** + * Return true if the caller is the last user of the LRU element. + */ +static inline bool +daos_lru_is_last_user(struct daos_llink *llink) +{ + return llink->ll_ref <= 2; +} + #endif diff --git a/src/include/daos/pool.h b/src/include/daos/pool.h index 370f626f1f03..665051991d81 100644 --- a/src/include/daos/pool.h +++ b/src/include/daos/pool.h @@ -50,7 +50,9 @@ #define DAOS_PO_QUERY_PROP_CHECKPOINT_FREQ (1ULL << (PROP_BIT_START + 22)) #define DAOS_PO_QUERY_PROP_CHECKPOINT_THRESH (1ULL << (PROP_BIT_START + 23)) #define DAOS_PO_QUERY_PROP_REINT_MODE (1ULL << (PROP_BIT_START + 24)) -#define DAOS_PO_QUERY_PROP_BIT_END 40 +#define DAOS_PO_QUERY_PROP_SVC_OPS_ENABLED (1ULL << (PROP_BIT_START + 25)) +#define DAOS_PO_QUERY_PROP_SVC_OPS_ENTRY_AGE (1ULL << (PROP_BIT_START + 26)) +#define DAOS_PO_QUERY_PROP_BIT_END 42 #define DAOS_PO_QUERY_PROP_ALL \ (DAOS_PO_QUERY_PROP_LABEL | DAOS_PO_QUERY_PROP_SPACE_RB | DAOS_PO_QUERY_PROP_SELF_HEAL | \ @@ -63,7 +65,8 @@ DAOS_PO_QUERY_PROP_SCRUB_THRESH | DAOS_PO_QUERY_PROP_SVC_REDUN_FAC | \ DAOS_PO_QUERY_PROP_OBJ_VERSION | DAOS_PO_QUERY_PROP_PERF_DOMAIN | \ DAOS_PO_QUERY_PROP_CHECKPOINT_MODE | DAOS_PO_QUERY_PROP_CHECKPOINT_FREQ | \ - DAOS_PO_QUERY_PROP_CHECKPOINT_THRESH | DAOS_PO_QUERY_PROP_REINT_MODE) + DAOS_PO_QUERY_PROP_CHECKPOINT_THRESH | DAOS_PO_QUERY_PROP_REINT_MODE | \ + DAOS_PO_QUERY_PROP_SVC_OPS_ENABLED | DAOS_PO_QUERY_PROP_SVC_OPS_ENTRY_AGE) /* * Version 1 corresponds to 2.2 (aggregation optimizations) diff --git a/src/include/daos_prop.h b/src/include/daos_prop.h index 1155ff9d1f11..4722a922f1d7 100644 --- a/src/include/daos_prop.h +++ b/src/include/daos_prop.h @@ -135,6 +135,10 @@ enum daos_pool_props { DAOS_PROP_PO_CHECKPOINT_THRESH, /** Reintegration mode for pool, data_sync|no_data_sync default is data_sync*/ DAOS_PROP_PO_REINT_MODE, + /** Metadata duplicate operations detection enabled (1) or disabled (0) */ + DAOS_PROP_PO_SVC_OPS_ENABLED, + /** Metadata duplicate operations SVC_OPS KVS max entry age (seconds), default 300 */ + DAOS_PROP_PO_SVC_OPS_ENTRY_AGE, DAOS_PROP_PO_MAX, }; @@ -244,12 +248,16 @@ enum { }; #define DAOS_PROP_PO_CHECKPOINT_MODE_DEFAULT DAOS_CHECKPOINT_TIMED -#define DAOS_PROP_PO_CHECKPOINT_FREQ_DEFAULT 5 /* 5 seconds */ 
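
Each new pool property above claims the next free bit past `PROP_BIT_START`, and `DAOS_PO_QUERY_PROP_BIT_END` moves from 40 to 42 to account for the two new entries. A Go sketch of the same bookkeeping; the `propBitStart = 16` offset is an assumption chosen so the arithmetic lines up with the new BIT_END, not a value taken from this patch:

```go
package main

import "fmt"

// propBitStart stands in for PROP_BIT_START; 16 is an assumed offset chosen
// so the arithmetic below matches the updated BIT_END of 42.
const propBitStart = 16

const (
	queryPropSvcOpsEnabled  uint64 = 1 << (propBitStart + 25)
	queryPropSvcOpsEntryAge uint64 = 1 << (propBitStart + 26)
)

func main() {
	all := queryPropSvcOpsEnabled | queryPropSvcOpsEntryAge
	// One bit per property keeps membership tests to a single mask.
	fmt.Println(all&queryPropSvcOpsEnabled != 0)  // true
	fmt.Println(all&queryPropSvcOpsEntryAge != 0) // true
	fmt.Println(propBitStart + 26) // 42, the new BIT_END under this assumption
}
```
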
-#define DAOS_PROP_PO_CHECKPOINT_FREQ_MIN 1 /* 1 seconds */ +#define DAOS_PROP_PO_CHECKPOINT_FREQ_DEFAULT 5 /* 5 seconds */ +#define DAOS_PROP_PO_CHECKPOINT_FREQ_MIN 1 /* 1 seconds */ #define DAOS_PROP_PO_CHECKPOINT_FREQ_MAX (1 << 20) /* 1 million seconds */ -#define DAOS_PROP_PO_CHECKPOINT_THRESH_DEFAULT 50 /* 50 % WAL capacity */ -#define DAOS_PROP_PO_CHECKPOINT_THRESH_MAX 75 /* 75 % WAL capacity */ -#define DAOS_PROP_PO_CHECKPOINT_THRESH_MIN 10 /* 10 % WAL capacity */ +#define DAOS_PROP_PO_CHECKPOINT_THRESH_DEFAULT 50 /* 50 % WAL capacity */ +#define DAOS_PROP_PO_CHECKPOINT_THRESH_MAX 75 /* 75 % WAL capacity */ +#define DAOS_PROP_PO_CHECKPOINT_THRESH_MIN 10 /* 10 % WAL capacity */ +#define DAOS_PROP_PO_SVC_OPS_ENABLED_DEFAULT 1 /* true: enabled by default */ +#define DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_DEFAULT 300 /* 300 seconds */ +#define DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MIN 150 /* 150 seconds */ +#define DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_MAX 600 /* 600 seconds */ /** self healing strategy bits */ #define DAOS_SELF_HEAL_AUTO_EXCLUDE (1U << 0) diff --git a/src/include/daos_srv/control.h b/src/include/daos_srv/control.h index e3995a2e30be..4ac9187724a6 100644 --- a/src/include/daos_srv/control.h +++ b/src/include/daos_srv/control.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -51,6 +51,7 @@ dpdk_cli_override_opts; #define NVME_CONF_SET_HOTPLUG_RANGE "hotplug_busid_range" #define NVME_CONF_SET_ACCEL_PROPS "accel_props" #define NVME_CONF_SET_SPDK_RPC_SERVER "spdk_rpc_srv" +#define NVME_CONF_SET_AUTO_FAULTY "auto_faulty" /** Supported acceleration engine settings */ #define NVME_ACCEL_NONE "none" diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 2894f22c901c..2e0c5244d9d6 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -12,6 +12,8 @@ #ifndef __DAOS_SRV_POOL_H__ #define __DAOS_SRV_POOL_H__ +#include + #include #include #include @@ -30,6 +32,8 @@ */ #define DS_POOL_OBJ_VERSION 1 +/* age of an entry in svc_ops KVS before it may be evicted */ +#define DEFAULT_SVC_OPS_ENTRY_AGE_SEC_MAX 300ULL /* * Pool object * @@ -191,6 +195,48 @@ struct ds_pool_svc_op_val { char ov_resvd[60]; }; +/* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. + * allocates the byte-stream, caller must free with D_FREE(). 
+ */ +static inline int +ds_pool_svc_op_key_encode(struct ds_pool_svc_op_key *in, d_iov_t *enc_out) +{ + struct ds_pool_svc_op_key *out; + + /* encoding is simple for this type, just another struct ds_pool_svc_op_key */ + D_ALLOC_PTR(out); + if (out == NULL) + return -DER_NOMEM; + + out->ok_client_time = htobe64(in->ok_client_time); + uuid_copy(out->ok_client_id, in->ok_client_id); + d_iov_set(enc_out, (void *)out, sizeof(*out)); + + return 0; +} + +static inline int +ds_pool_svc_op_key_decode(d_iov_t *enc_in, struct ds_pool_svc_op_key *out) +{ + struct ds_pool_svc_op_key *in = enc_in->iov_buf; + + if (enc_in->iov_len < sizeof(struct ds_pool_svc_op_key)) + return -DER_INVAL; + + out->ok_client_time = be64toh(in->ok_client_time); + uuid_copy(out->ok_client_id, in->ok_client_id); + + return 0; +} + +struct rdb_tx; +int +ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t *cli_uuidp, + uint64_t cli_time, bool *is_dup, struct ds_pool_svc_op_val *valp); +int +ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t *cli_uuidp, + uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp); + /* Find ds_pool_child in cache, hold one reference */ struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid); /* Put the reference held by ds_pool_child_lookup() */ @@ -364,8 +410,7 @@ ds_pool_get_version(struct ds_pool *pool) int ds_start_chkpt_ult(struct ds_pool_child *child); void -ds_stop_chkpt_ult(struct ds_pool_child *child); -struct rdb_tx; + ds_stop_chkpt_ult(struct ds_pool_child *child); int ds_pool_lookup_hdl_cred(struct rdb_tx *tx, uuid_t pool_uuid, uuid_t pool_hdl_uuid, d_iov_t *cred); diff --git a/src/include/daos_srv/rdb.h b/src/include/daos_srv/rdb.h index ec8e17ede53e..63581bf84fa8 100644 --- a/src/include/daos_srv/rdb.h +++ b/src/include/daos_srv/rdb.h @@ -208,8 +208,9 @@ d_iov_t prefix ## name = { \ /** KVS classes */ enum rdb_kvs_class { - RDB_KVS_GENERIC, /**< hash-ordered byte-stream keys */ - RDB_KVS_INTEGER /**< numerically-ordered uint64_t keys */ + RDB_KVS_GENERIC, /**< hash-ordered byte-stream keys */ + RDB_KVS_INTEGER, /**< numerically-ordered uint64_t keys */ + RDB_KVS_LEXICAL /**< lexically-ordered byte-stream keys */ }; /** KVS attributes */ diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index 9c9999ba97cf..b42cdfbac38d 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -280,7 +280,7 @@ enum { /** Dynamic evtree root supported for this pool */ VOS_POOL_FEAT_DYN_ROOT = (1ULL << 2), /** Embedded value in tree root supported */ - VOS_POOL_FEAT_EMB_VALUE = (1ULL << 3), + VOS_POOL_FEAT_EMBED_FIRST = (1ULL << 3), /** Flat DKEY support enabled */ VOS_POOL_FEAT_FLAT_DKEY = (1ULL << 4), }; diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 779a547768b8..164421174a00 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -581,12 +581,15 @@ d_getenv_str(char *str_val, size_t str_size, const char *name); int d_agetenv_str(char **str_val, const char *name); void -d_free_env_str(char **str_val); +d_freeenv_str(char **str_val); int d_getenv_bool(const char *name, bool *bool_val); int d_getenv_char(const char *name, char *char_val); int +d_getenv_int(const char *name, unsigned int *uint_val) + __attribute__((deprecated("use d_getenv_uint"))); +int d_getenv_uint(const char *name, unsigned int *uint_val); int d_getenv_uint32_t(const char *name, uint32_t *uint32_val); @@ -601,13 +604,6 @@ d_unsetenv(const 
char *name); int d_clearenv(void); -static inline int -d_getenv_int(const char *name, unsigned int *uint_val) -{ - D_WARN("d_getenv_int() is deprecated, please use d_getenv_uint()"); - return d_getenv_uint(name, uint_val); -} - int d_write_string_buffer(struct d_string_buffer_t *buf, const char *fmt, ...); void diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index 57cf0faa7235..78db4c699e4f 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -449,13 +449,16 @@ _split_env(char *env, char **name, char **value) */ int dc_mgmt_net_cfg(const char *name) { - int rc; - char buf[SYS_INFO_BUF_SIZE]; - char *crt_timeout; - char *ofi_interface; - char *ofi_domain; - char *cli_srx_set; - struct dc_mgmt_sys_info info; + int rc; + char *crt_phy_addr_str; + char *crt_ctx_share_addr = NULL; + char *cli_srx_set = NULL; + char *crt_timeout = NULL; + char *ofi_interface; + char *ofi_interface_env = NULL; + char *ofi_domain = ""; + char *ofi_domain_env = NULL; + struct dc_mgmt_sys_info info; Mgmt__GetAttachInfoResp *resp; /* Query the agent for the CaRT network configuration parameters */ @@ -491,26 +494,34 @@ int dc_mgmt_net_cfg(const char *name) g_num_serv_ranks = resp->n_rank_uris; D_INFO("Setting number of server ranks to %d\n", g_num_serv_ranks); /* These two are always set */ - rc = d_setenv("CRT_PHY_ADDR_STR", info.provider, 1); + crt_phy_addr_str = info.provider; + rc = d_setenv("CRT_PHY_ADDR_STR", crt_phy_addr_str, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); - sprintf(buf, "%d", info.crt_ctx_share_addr); - rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1); + rc = asprintf(&crt_ctx_share_addr, "%d", info.crt_ctx_share_addr); + if (rc < 0) { + crt_ctx_share_addr = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + rc = d_setenv("CRT_CTX_SHARE_ADDR", crt_ctx_share_addr, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); /* If the server has set this, the client must use the same value. */ if (info.srv_srx_set != -1) { - sprintf(buf, "%d", info.srv_srx_set); - rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1); + rc = asprintf(&cli_srx_set, "%d", info.srv_srx_set); + if (rc < 0) { + cli_srx_set = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + rc = d_setenv("FI_OFI_RXM_USE_SRX", cli_srx_set, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); - D_INFO("Using server's value for FI_OFI_RXM_USE_SRX: %s\n", - buf); + D_INFO("Using server's value for FI_OFI_RXM_USE_SRX: %s\n", cli_srx_set); } else { /* Client may not set it if the server hasn't. 
*/ - cli_srx_set = getenv("FI_OFI_RXM_USE_SRX"); + d_agetenv_str(&cli_srx_set, "FI_OFI_RXM_USE_SRX"); if (cli_srx_set) { D_ERROR("Client set FI_OFI_RXM_USE_SRX to %s, " "but server is unset!\n", cli_srx_set); @@ -519,21 +530,26 @@ int dc_mgmt_net_cfg(const char *name) } /* Allow client env overrides for these three */ - crt_timeout = getenv("CRT_TIMEOUT"); + d_agetenv_str(&crt_timeout, "CRT_TIMEOUT"); if (!crt_timeout) { - sprintf(buf, "%d", info.crt_timeout); - rc = d_setenv("CRT_TIMEOUT", buf, 1); + rc = asprintf(&crt_timeout, "%d", info.crt_timeout); + if (rc < 0) { + crt_timeout = NULL; + D_GOTO(cleanup, rc = -DER_NOMEM); + } + D_INFO("setenv CRT_TIMEOUT=%s\n", crt_timeout); + rc = d_setenv("CRT_TIMEOUT", crt_timeout, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); } else { - D_INFO("Using client provided CRT_TIMEOUT: %s\n", - crt_timeout); + D_DEBUG(DB_MGMT, "Using client provided CRT_TIMEOUT: %s\n", crt_timeout); } - ofi_interface = getenv("OFI_INTERFACE"); - ofi_domain = getenv("OFI_DOMAIN"); - if (!ofi_interface) { - rc = d_setenv("OFI_INTERFACE", info.interface, 1); + d_agetenv_str(&ofi_interface_env, "OFI_INTERFACE"); + d_agetenv_str(&ofi_domain_env, "OFI_DOMAIN"); + if (!ofi_interface_env) { + ofi_interface = info.interface; + rc = d_setenv("OFI_INTERFACE", ofi_interface, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -541,31 +557,39 @@ int dc_mgmt_net_cfg(const char *name) * If we use the agent as the source, client env shouldn't be allowed to override * the domain. Otherwise we could get a mismatch between interface and domain. */ - if (ofi_domain) + ofi_domain = info.domain; + if (ofi_domain_env) D_WARN("Ignoring OFI_DOMAIN '%s' because OFI_INTERFACE is not set; using " "automatic configuration instead\n", ofi_domain); - rc = d_setenv("OFI_DOMAIN", info.domain, 1); - if (rc != 0) + rc = d_setenv("OFI_DOMAIN", ofi_domain, 1); + if (rc != 0) { D_GOTO(cleanup, rc = d_errno2der(errno)); + } } else { + ofi_interface = ofi_interface_env; D_INFO("Using client provided OFI_INTERFACE: %s\n", ofi_interface); /* If the client env didn't provide a domain, we can assume we don't need one. */ - if (ofi_domain) + if (ofi_domain_env) { + ofi_domain = ofi_domain_env; D_INFO("Using client provided OFI_DOMAIN: %s\n", ofi_domain); + } } - D_INFO("Network interface: %s, Domain: %s\n", getenv("OFI_INTERFACE"), - getenv("OFI_DOMAIN")); + D_INFO("Network interface: %s, Domain: %s\n", ofi_interface, ofi_domain); D_DEBUG(DB_MGMT, "CaRT initialization with:\n" "\tCRT_PHY_ADDR_STR: %s, " "CRT_CTX_SHARE_ADDR: %s, CRT_TIMEOUT: %s\n", - getenv("CRT_PHY_ADDR_STR"), - getenv("CRT_CTX_SHARE_ADDR"), getenv("CRT_TIMEOUT")); + crt_phy_addr_str, crt_ctx_share_addr, crt_timeout); cleanup: + d_freeenv_str(&ofi_domain_env); + d_freeenv_str(&ofi_interface_env); + d_freeenv_str(&crt_timeout); + d_freeenv_str(&cli_srx_set); + d_freeenv_str(&crt_ctx_share_addr); put_attach_info(&info, resp); return rc; @@ -585,14 +609,16 @@ int dc_mgmt_net_cfg_check(const char *name) /* Client may not set it if the server hasn't. 
*/ if (info.srv_srx_set == -1) { - cli_srx_set = getenv("FI_OFI_RXM_USE_SRX"); + d_agetenv_str(&cli_srx_set, "FI_OFI_RXM_USE_SRX"); if (cli_srx_set) { D_ERROR("Client set FI_OFI_RXM_USE_SRX to %s, " "but server is unset!\n", cli_srx_set); + d_freeenv_str(&cli_srx_set); rc = -DER_INVAL; goto out; } } + rc = 0; out: put_attach_info(&info, resp); diff --git a/src/mgmt/smd.pb-c.c b/src/mgmt/smd.pb-c.c index b3ed3284385c..de49e886e192 100644 --- a/src/mgmt/smd.pb-c.c +++ b/src/mgmt/smd.pb-c.c @@ -2833,19 +2833,19 @@ const ProtobufCEnumDescriptor ctl__nvme_dev_state__descriptor = }; static const ProtobufCEnumValue ctl__led_state__enum_values_by_number[5] = { - { "OFF", "CTL__LED_STATE__OFF", 0 }, + { "NA", "CTL__LED_STATE__NA", 0 }, { "QUICK_BLINK", "CTL__LED_STATE__QUICK_BLINK", 1 }, { "ON", "CTL__LED_STATE__ON", 2 }, { "SLOW_BLINK", "CTL__LED_STATE__SLOW_BLINK", 3 }, - { "NA", "CTL__LED_STATE__NA", 4 }, + { "OFF", "CTL__LED_STATE__OFF", 4 }, }; static const ProtobufCIntRange ctl__led_state__value_ranges[] = { {0, 0},{0, 5} }; static const ProtobufCEnumValueIndex ctl__led_state__enum_values_by_name[5] = { - { "NA", 4 }, - { "OFF", 0 }, + { "NA", 0 }, + { "OFF", 4 }, { "ON", 2 }, { "QUICK_BLINK", 1 }, { "SLOW_BLINK", 3 }, diff --git a/src/mgmt/smd.pb-c.h b/src/mgmt/smd.pb-c.h index 19ac9fc3d149..fd4ca542b604 100644 --- a/src/mgmt/smd.pb-c.h +++ b/src/mgmt/smd.pb-c.h @@ -66,9 +66,9 @@ typedef enum _Ctl__NvmeDevState { } Ctl__NvmeDevState; typedef enum _Ctl__LedState { /* - * Equivalent to SPDK_VMD_LED_STATE_OFF + * Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) */ - CTL__LED_STATE__OFF = 0, + CTL__LED_STATE__NA = 0, /* * Equivalent to SPDK_VMD_LED_STATE_IDENTIFY (4Hz blink) */ @@ -82,9 +82,9 @@ typedef enum _Ctl__LedState { */ CTL__LED_STATE__SLOW_BLINK = 3, /* - * Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled) + * Equivalent to SPDK_VMD_LED_STATE_OFF */ - CTL__LED_STATE__NA = 4 + CTL__LED_STATE__OFF = 4 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(CTL__LED_STATE) } Ctl__LedState; typedef enum _Ctl__LedAction { @@ -305,7 +305,7 @@ struct _Ctl__NvmeController }; #define CTL__NVME_CONTROLLER__INIT \ { PROTOBUF_C_MESSAGE_INIT (&ctl__nvme_controller__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0,NULL, 0,NULL, CTL__NVME_DEV_STATE__UNKNOWN, CTL__LED_STATE__OFF, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string } + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0,NULL, 0,NULL, CTL__NVME_DEV_STATE__UNKNOWN, CTL__LED_STATE__NA, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string } /* @@ -558,7 +558,7 @@ struct _Ctl__LedManageReq }; #define CTL__LED_MANAGE_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&ctl__led_manage_req__descriptor) \ - , (char *)protobuf_c_empty_string, CTL__LED_ACTION__GET, CTL__LED_STATE__OFF, 0 } + , (char *)protobuf_c_empty_string, CTL__LED_ACTION__GET, CTL__LED_STATE__NA, 0 } struct _Ctl__DevReplaceReq diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index f92c2411ff8a..1ae15974dc51 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1973,8 +1973,12 @@ ds_mgmt_smd_free_dev(Ctl__SmdDevice *dev) D_FREE(dev->ctrlr->fw_rev); D_FREE(dev->ctrlr->vendor_id); D_FREE(dev->ctrlr->pci_dev_type); - if (dev->ctrlr->namespaces != NULL) + if (dev->ctrlr->namespaces != NULL) { D_FREE(dev->ctrlr->namespaces[0]); + 
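
The LED-state renumbering above makes `NA` the zero value of the enum, which is why the `__INIT` macros now default to `CTL__LED_STATE__NA`: a protobuf enum field that is never set decodes to value 0, so the zero value should mean "unknown" rather than `OFF`. A small Go illustration of that zero-value behavior, using a plain enum in the protobuf style (names shortened):

```go
package main

import "fmt"

type ledState int32

// NA is deliberately the zero value: a message with the field unset
// decodes to NA rather than OFF.
const (
	ledNA ledState = iota // SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled)
	ledQuickBlink
	ledOn
	ledSlowBlink
	ledOff
)

func (s ledState) String() string {
	return [...]string{"NA", "QUICK_BLINK", "ON", "SLOW_BLINK", "OFF"}[s]
}

type nvmeController struct {
	LedState ledState
}

func main() {
	// Zero-initialized, like a freshly __INIT'ed or default-decoded message.
	var c nvmeController
	fmt.Println(c.LedState) // NA; under the old numbering this read as OFF
}
```
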
D_FREE(dev->ctrlr->namespaces); + dev->ctrlr->namespaces = NULL; + dev->ctrlr->n_namespaces = 0; + } } } diff --git a/src/mgmt/srv_query.c b/src/mgmt/srv_query.c index 7bfd31f85d85..ff76b77a767e 100644 --- a/src/mgmt/srv_query.c +++ b/src/mgmt/srv_query.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -338,7 +338,7 @@ ctrlr_reset_str_fields(Ctl__NvmeController *ctrlr) static int add_ctrlr_details(Ctl__NvmeController *ctrlr, struct bio_dev_info *dev_info) { - int rc; + int rc = 0; rc = copy_str2ctrlr(&ctrlr->pci_addr, dev_info->bdi_traddr); if (rc != 0) @@ -364,6 +364,32 @@ add_ctrlr_details(Ctl__NvmeController *ctrlr, struct bio_dev_info *dev_info) ctrlr->model, ctrlr->serial, ctrlr->fw_rev, ctrlr->vendor_id, ctrlr->pci_dev_type, ctrlr->socket_id); + /* Populate NVMe namespace id and capacity */ + + if (dev_info->bdi_ctrlr->nss == NULL) { + D_ERROR("nss not initialized in bio_dev_info"); + return -DER_INVAL; + } + D_ASSERT(dev_info->bdi_ctrlr->nss->next == NULL); + + /* When describing a SMD, only one NVMe namespace is relevant */ + D_ALLOC_ARRAY(ctrlr->namespaces, 1); + if (ctrlr->namespaces == NULL) { + return -DER_NOMEM; + } + D_ALLOC_PTR(ctrlr->namespaces[0]); + if (ctrlr->namespaces[0] == NULL) { + return -DER_NOMEM; + } + ctrlr->n_namespaces = 1; + ctl__nvme_controller__namespace__init(ctrlr->namespaces[0]); + + ctrlr->namespaces[0]->id = dev_info->bdi_ctrlr->nss->id; + ctrlr->namespaces[0]->size = dev_info->bdi_ctrlr->nss->size; + + D_DEBUG(DB_MGMT, "ns id/size: '%d' '%ld'\n", ctrlr->namespaces[0]->id, + ctrlr->namespaces[0]->size); + return 0; } @@ -426,12 +452,6 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) for (j = 0; j < dev_info->bdi_tgt_cnt; j++) resp->devices[i]->tgt_ids[j] = dev_info->bdi_tgts[j]; - if (dev_info->bdi_ctrlr == NULL) { - D_ERROR("ctrlr not initialized in bio_dev_info"); - rc = -DER_INVAL; - break; - } - /* Populate NVMe controller details */ D_ALLOC_PTR(resp->devices[i]->ctrlr); @@ -443,40 +463,14 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) /* Set string fields to NULL to allow D_FREE to work as expected on cleanup */ ctrlr_reset_str_fields(resp->devices[i]->ctrlr); - rc = add_ctrlr_details(resp->devices[i]->ctrlr, dev_info); - if (rc != 0) - break; - - /* Populate NVMe namespace id and capacity */ - - if (dev_info->bdi_ctrlr->nss == NULL) { - D_ERROR("nss not initialized in bio_dev_info"); - rc = -DER_INVAL; - break; - } - D_ASSERT(dev_info->bdi_ctrlr->nss->next == NULL); - - /* When describing a SMD, only one NVMe namespace is relevant */ - D_ALLOC_ARRAY(resp->devices[i]->ctrlr->namespaces, 1); - if (resp->devices[i]->ctrlr->namespaces == NULL) { - rc = -DER_NOMEM; - break; - } - D_ALLOC_PTR(resp->devices[i]->ctrlr->namespaces[0]); - if (resp->devices[i]->ctrlr->namespaces[0] == NULL) { - rc = -DER_NOMEM; - break; + if (dev_info->bdi_ctrlr != NULL) { + rc = add_ctrlr_details(resp->devices[i]->ctrlr, dev_info); + if (rc != 0) + break; + resp->devices[i]->ctrlr_namespace_id = dev_info->bdi_ctrlr->nss->id; + } else { + D_DEBUG(DB_MGMT, "ctrlr not initialized in bio_dev_info, unplugged?"); } - resp->devices[i]->ctrlr->n_namespaces = 1; - ctl__nvme_controller__namespace__init(resp->devices[i]->ctrlr->namespaces[0]); - - resp->devices[i]->ctrlr->namespaces[0]->id = dev_info->bdi_ctrlr->nss->id; - resp->devices[i]->ctrlr->namespaces[0]->size = dev_info->bdi_ctrlr->nss->size; - resp->devices[i]->ctrlr_namespace_id = 
dev_info->bdi_ctrlr->nss->id; - - D_DEBUG(DB_MGMT, "ns id/size: '%d' '%ld'\n", - resp->devices[i]->ctrlr->namespaces[0]->id, - resp->devices[i]->ctrlr->namespaces[0]->size); /* Populate NVMe device state */ @@ -484,7 +478,6 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp) resp->devices[i]->ctrlr->dev_state = CTL__NVME_DEV_STATE__UNPLUGGED; goto next_dev; } - if ((dev_info->bdi_flags & NVME_DEV_FL_FAULTY) != 0) resp->devices[i]->ctrlr->dev_state = CTL__NVME_DEV_STATE__EVICTED; else if ((dev_info->bdi_flags & NVME_DEV_FL_INUSE) == 0) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 088e87067c47..95c1c22d5832 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -7060,7 +7060,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + leader = coa->coa_dct_nr; + if (auxi->io_retry) { + if (unlikely(spa->pa_auxi.shard >= obj->cob_shards_nr)) + goto new_leader; + /* Try to reuse the same leader. */ rc = obj_shard_open(obj, spa->pa_auxi.shard, map_ver, &shard); if (rc == 0) { @@ -7078,10 +7083,13 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo /* Then change to new leader for retry. */ } - /* Randomly select a rank as the leader. */ - leader = d_rand() % coa->coa_dct_nr; - new_leader: + if (leader == coa->coa_dct_nr) + /* Randomly select a rank as the leader. */ + leader = d_rand() % coa->coa_dct_nr; + else + leader = (leader + 1) % coa->coa_dct_nr; + dct = &coa->coa_dcts[leader]; len = dct->dct_bitmap_sz << 3; @@ -7098,8 +7106,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo } } - /* Try another for leader. */ - leader = (leader + 1) % coa->coa_dct_nr; goto new_leader; gen_mbs: diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71aedeca8954..20e1570e6d4d 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1305,9 +1305,9 @@ agg_peer_update_ult(void *arg) iod.iod_size = entry->ae_rsize; obj = obj_hdl2ptr(entry->ae_obj_hdl); for (peer = 0; peer < p; peer++) { - /* Only update the available parities */ - if (peer == pidx || entry->ae_peer_pshards[peer].sd_rank == DAOS_TGT_IGNORE) + if (peer == pidx) continue; + D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE); tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; enqueue_id = 0; @@ -1443,6 +1443,7 @@ agg_peer_update(struct ec_agg_entry *entry, bool write_parity) struct daos_shard_loc *peer_loc; uint32_t failed_tgts_cnt = 0; uint32_t p = ec_age2p(entry); + uint32_t pidx = ec_age2pidx(entry); uint32_t peer; int i, tid, rc = 0; @@ -1464,24 +1465,19 @@ agg_peer_update(struct ec_agg_entry *entry, bool write_parity) return rc; } - rc = agg_get_obj_handle(entry); - if (rc) { - D_ERROR("Failed to open object: "DF_RC"\n", DP_RC(rc)); - goto out; - } - if (targets != NULL) { for (peer = 0; peer < p; peer++) { + if (peer == pidx) + continue; peer_loc = &entry->ae_peer_pshards[peer]; for (i = 0; i < failed_tgts_cnt; i++) { - if (targets[i].ta_comp.co_rank == peer_loc->sd_rank || - peer_loc->sd_rank == DAOS_TGT_IGNORE) { - D_DEBUG(DB_EPC, DF_UOID" peer parity " - "tgt gailed rank %d, tgt_idx " - "%d.\n", DP_UOID(entry->ae_oid), - peer_loc->sd_rank, - peer_loc->sd_tgt_idx); - goto out; + if (peer_loc->sd_rank == DAOS_TGT_IGNORE || + (targets[i].ta_comp.co_rank == peer_loc->sd_rank && + targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx)) { + D_DEBUG(DB_EPC, DF_UOID" peer parity tgt 
failed rank %d, " + "tgt_idx %d.\n", DP_UOID(entry->ae_oid), + peer_loc->sd_rank, peer_loc->sd_tgt_idx); + D_GOTO(out, rc = -1); } } } @@ -1640,7 +1636,10 @@ agg_process_holes_ult(void *arg) continue; for (i = 0; targets && i < failed_tgts_cnt; i++) { - if (targets[i].ta_comp.co_rank == entry->ae_peer_pshards[peer].sd_rank) { + if (entry->ae_peer_pshards[peer].sd_rank == DAOS_TGT_IGNORE || + (targets[i].ta_comp.co_rank == entry->ae_peer_pshards[peer].sd_rank && + targets[i].ta_comp.co_index == + entry->ae_peer_pshards[peer].sd_tgt_idx)) { D_ERROR(DF_UOID" peer %d parity tgt failed\n", DP_UOID(entry->ae_oid), peer); rc = -1; diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 9cd0473effd5..8afead2ee51b 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -699,7 +699,7 @@ obj_set_reply_nrs(crt_rpc_t *rpc, daos_handle_t ioh, d_sg_list_t *echo_sgl, uint uint32_t nrs_count = orw->orw_nr; int i, j, idx; - if (nrs_count == 0) + if (nrs_count == 0 || (orw->orw_flags & ORF_CHECK_EXISTENCE)) return 0; /* Re-entry case. */ @@ -1409,11 +1409,11 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io struct daos_recx_ep_list *shadows = NULL; bulk_op = CRT_BULK_PUT; + if (orw->orw_flags & ORF_CHECK_EXISTENCE) + fetch_flags = VOS_OF_FETCH_CHECK_EXISTENCE; if (!rma && orw->orw_sgls.ca_arrays == NULL) { spec_fetch = true; - if (orw->orw_flags & ORF_CHECK_EXISTENCE) - fetch_flags = VOS_OF_FETCH_CHECK_EXISTENCE; - else + if (!(orw->orw_flags & ORF_CHECK_EXISTENCE)) fetch_flags = VOS_OF_FETCH_SIZE_ONLY; } diff --git a/src/pool/rpc.c b/src/pool/rpc.c index 3ad652123415..667e5185c62c 100644 --- a/src/pool/rpc.c +++ b/src/pool/rpc.c @@ -354,6 +354,12 @@ pool_query_bits(daos_pool_info_t *po_info, daos_prop_t *prop) case DAOS_PROP_PO_REINT_MODE: bits |= DAOS_PO_QUERY_PROP_REINT_MODE; break; + case DAOS_PROP_PO_SVC_OPS_ENABLED: + bits |= DAOS_PO_QUERY_PROP_SVC_OPS_ENABLED; + break; + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: + bits |= DAOS_PO_QUERY_PROP_SVC_OPS_ENTRY_AGE; + break; default: D_ERROR("ignore bad dpt_type %d.\n", entry->dpe_type); break; diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index c95f15cc7159..1d96e5d1645b 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -92,6 +92,8 @@ struct pool_iv_prop { uint32_t pip_svc_list_offset; uint32_t pip_perf_domain; uint32_t pip_reint_mode; + uint32_t pip_svc_ops_enabled; + uint32_t pip_svc_ops_entry_age; char pip_iv_buf[0]; }; @@ -255,10 +257,6 @@ int ds_pool_iv_srv_hdl_invalidate(struct ds_pool *pool); int ds_pool_iv_conn_hdl_fetch(struct ds_pool *pool); int ds_pool_iv_conn_hdl_invalidate(struct ds_pool *pool, uuid_t hdl_uuid); -int ds_pool_iv_srv_hdl_fetch_non_sys(struct ds_pool *pool, - uuid_t *srv_cont_hdl, - uuid_t *srv_pool_hdl); - /* * srv_metrics.c */ diff --git a/src/pool/srv_iv.c b/src/pool/srv_iv.c index 92970ff3d5fd..263fbd613f46 100644 --- a/src/pool/srv_iv.c +++ b/src/pool/srv_iv.c @@ -220,6 +220,12 @@ pool_iv_prop_l2g(daos_prop_t *prop, struct pool_iv_prop *iv_prop) case DAOS_PROP_PO_REINT_MODE: iv_prop->pip_reint_mode = prop_entry->dpe_val; break; + case DAOS_PROP_PO_SVC_OPS_ENABLED: + iv_prop->pip_svc_ops_enabled = prop_entry->dpe_val; + break; + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: + iv_prop->pip_svc_ops_entry_age = prop_entry->dpe_val; + break; default: D_ASSERTF(0, "bad dpe_type %d\n", prop_entry->dpe_type); break; @@ -360,6 +366,12 @@ pool_iv_prop_g2l(struct pool_iv_prop *iv_prop, daos_prop_t *prop) case DAOS_PROP_PO_REINT_MODE: prop_entry->dpe_val = 
iv_prop->pip_reint_mode; break; + case DAOS_PROP_PO_SVC_OPS_ENABLED: + prop_entry->dpe_val = iv_prop->pip_svc_ops_enabled; + break; + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: + prop_entry->dpe_val = iv_prop->pip_svc_ops_entry_age; + break; default: D_ASSERTF(0, "bad dpe_type %d\n", prop_entry->dpe_type); break; @@ -1496,63 +1508,6 @@ ds_pool_iv_srv_hdl_fetch(struct ds_pool *pool, uuid_t *pool_hdl_uuid, return rc; } -struct srv_hdl_ult_arg { - struct ds_pool *pool; - ABT_eventual eventual; -}; - -static void -pool_iv_srv_hdl_fetch_ult(void *data) -{ - struct srv_hdl_ult_arg *arg = data; - int rc; - - rc = ds_pool_iv_srv_hdl_fetch(arg->pool, NULL, NULL); - - ABT_eventual_set(arg->eventual, (void *)&rc, sizeof(rc)); -} - -int -ds_pool_iv_srv_hdl_fetch_non_sys(struct ds_pool *pool, uuid_t *srv_cont_hdl, - uuid_t *srv_pool_hdl) -{ - struct srv_hdl_ult_arg arg; - ABT_eventual eventual; - int *status; - int rc; - - /* Fetch the capability from the leader. To avoid extra locks, - * all metadatas are maintained by xstream 0, so let's create - * an ULT on xstream 0 to let xstream 0 to handle capa fetch - * and update. - */ - rc = ABT_eventual_create(sizeof(*status), &eventual); - if (rc != ABT_SUCCESS) - return dss_abterr2der(rc); - - arg.pool = pool; - arg.eventual = eventual; - rc = dss_ult_create(pool_iv_srv_hdl_fetch_ult, &arg, DSS_XS_SYS, - 0, 0, NULL); - if (rc) - D_GOTO(out_eventual, rc); - - rc = ABT_eventual_wait(eventual, (void **)&status); - if (rc != ABT_SUCCESS) - D_GOTO(out_eventual, rc = dss_abterr2der(rc)); - if (*status != 0) - D_GOTO(out_eventual, rc = *status); - - if (srv_cont_hdl) - uuid_copy(*srv_cont_hdl, pool->sp_srv_cont_hdl); - if (srv_pool_hdl) - uuid_copy(*srv_pool_hdl, pool->sp_srv_pool_hdl); - -out_eventual: - ABT_eventual_free(&eventual); - return rc; -} - int ds_pool_iv_prop_update(struct ds_pool *pool, daos_prop_t *prop) { diff --git a/src/pool/srv_layout.c b/src/pool/srv_layout.c index 98f434389c42..45fef96249cf 100644 --- a/src/pool/srv_layout.c +++ b/src/pool/srv_layout.c @@ -28,6 +28,9 @@ RDB_STRING_KEY(ds_pool_prop_, connectable); RDB_STRING_KEY(ds_pool_prop_, nhandles); RDB_STRING_KEY(ds_pool_prop_, svc_ops); RDB_STRING_KEY(ds_pool_prop_, svc_ops_enabled); +RDB_STRING_KEY(ds_pool_prop_, svc_ops_max); +RDB_STRING_KEY(ds_pool_prop_, svc_ops_num); +RDB_STRING_KEY(ds_pool_prop_, svc_ops_age); /** pool handle KVS */ RDB_STRING_KEY(ds_pool_prop_, handles); @@ -152,7 +155,14 @@ struct daos_prop_entry pool_prop_entries_default[DAOS_PROP_PO_NUM] = { .dpe_type = DAOS_PROP_PO_REINT_MODE, .dpe_val = DAOS_PROP_PO_REINT_MODE_DEFAULT, }, -}; + { + .dpe_type = DAOS_PROP_PO_SVC_OPS_ENABLED, + .dpe_val = DAOS_PROP_PO_SVC_OPS_ENABLED_DEFAULT, + }, + { + .dpe_type = DAOS_PROP_PO_SVC_OPS_ENTRY_AGE, + .dpe_val = DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_DEFAULT, + }}; daos_prop_t pool_prop_default = { .dpp_nr = DAOS_PROP_PO_NUM, diff --git a/src/pool/srv_layout.h b/src/pool/srv_layout.h index 370fc8894f7e..41228668f900 100644 --- a/src/pool/srv_layout.h +++ b/src/pool/srv_layout.h @@ -40,9 +40,9 @@ * * extern d_iov_t ds_pool_prop_new_key; comment_on_value_type * - * Note 1. The "new_key" name in ds_pool_prop_new_key must not appear (with very few exceptions) - * in the root KVS in src/container/srv_layout.h, that is, there must not usually be - * a ds_cont_prop_new_key, because the two root KVSs are the same RDB KVS. + * Note 1. 
The "new_key" name in ds_pool_prop_new_key must not appear in the root KVS in + * src/container/srv_layout.h, that is, there must not be a ds_cont_prop_new_key, because the two + * root KVSs are the same RDB KVS. * * Note 2. The comment_on_value_type shall focus on the value type only; * usage shall be described above in this comment following existing @@ -79,8 +79,11 @@ extern d_iov_t ds_pool_prop_checkpoint_mode; /* uint32_t */ extern d_iov_t ds_pool_prop_checkpoint_freq; /* uint32_t */ extern d_iov_t ds_pool_prop_checkpoint_thresh; /* uint32_t */ extern d_iov_t ds_pool_prop_reint_mode; /* uint32_t */ -extern d_iov_t ds_pool_prop_svc_ops; /* service ops KVS - common to pool, container */ -extern d_iov_t ds_pool_prop_svc_ops_enabled; /* uint32_t - common to pool, container */ +extern d_iov_t ds_pool_prop_svc_ops; /* service ops KVS */ +extern d_iov_t ds_pool_prop_svc_ops_enabled; /* uint32_t */ +extern d_iov_t ds_pool_prop_svc_ops_max; /* uint32_t */ +extern d_iov_t ds_pool_prop_svc_ops_num; /* uint32_t */ +extern d_iov_t ds_pool_prop_svc_ops_age; /* uint32_t */ /* Please read the IMPORTANT notes above before adding new keys. */ /* diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 6396e6bea590..40a84cd8a4bd 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -41,6 +41,8 @@ #define DAOS_POOL_GLOBAL_VERSION_WITH_HDL_CRED 1 #define DAOS_POOL_GLOBAL_VERSION_WITH_SVC_OPS_KVS 3 +#define PS_OPS_PER_SEC 4096 + /* * Return the corresponding VOS pool DF version or 0 if pool_global_version is * not supported. @@ -166,23 +168,24 @@ sched_cancel_and_wait(struct pool_svc_sched *sched) /* Pool service */ struct pool_svc { struct ds_rsvc ps_rsvc; - uuid_t ps_uuid; /* pool UUID */ - struct cont_svc *ps_cont_svc; /* one combined svc for now */ - ABT_rwlock ps_lock; /* for DB data */ - rdb_path_t ps_root; /* root KVS */ - rdb_path_t ps_handles; /* pool handle KVS */ - rdb_path_t ps_user; /* pool user attributes KVS */ - rdb_path_t ps_ops; /* metadata ops KVS */ + uuid_t ps_uuid; /* pool UUID */ + struct cont_svc *ps_cont_svc; /* one combined svc for now */ + ABT_rwlock ps_lock; /* for DB data */ + rdb_path_t ps_root; /* root KVS */ + rdb_path_t ps_handles; /* pool handle KVS */ + rdb_path_t ps_user; /* pool user attributes KVS */ + rdb_path_t ps_ops; /* metadata ops KVS */ struct ds_pool *ps_pool; struct pool_svc_events ps_events; uint32_t ps_global_version; int ps_svc_rf; - bool ps_force_notify;/* MS of PS membership */ + bool ps_force_notify; /* MS of PS membership */ struct pool_svc_sched ps_reconf_sched; - /* Check all containers RF for the pool */ - struct pool_svc_sched ps_rfcheck_sched; - /* The global pool map version on all pool targets */ - uint32_t ps_global_map_version; + struct pool_svc_sched ps_rfcheck_sched; /* Check all containers RF for the pool */ + uint32_t ps_global_map_version; /* global pool map version on all targets */ + uint32_t ps_ops_enabled; /* cached ds_pool_prop_svc_ops_enabled */ + uint32_t ps_ops_max; /* cached ds_pool_prop_svc_ops_max */ + uint32_t ps_ops_age; /* cached ds_pool_prop_svc_ops_age */ }; /* Pool service failed to start */ @@ -367,6 +370,8 @@ pool_prop_default_copy(daos_prop_t *prop_def, daos_prop_t *prop) case DAOS_PROP_PO_RP_PDA: case DAOS_PROP_PO_SVC_REDUN_FAC: case DAOS_PROP_PO_PERF_DOMAIN: + case DAOS_PROP_PO_SVC_OPS_ENABLED: + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: 
entry_def->dpe_val = entry->dpe_val; break; case DAOS_PROP_PO_POLICY: @@ -668,6 +673,20 @@ pool_prop_write(struct rdb_tx *tx, const rdb_path_t *kvs, daos_prop_t *prop) if (rc) return rc; break; + case DAOS_PROP_PO_SVC_OPS_ENABLED: + val32 = entry->dpe_val; + d_iov_set(&value, &val32, sizeof(val32)); + rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_enabled, &value); + if (rc) + return rc; + break; + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: + val32 = entry->dpe_val; + d_iov_set(&value, &val32, sizeof(val32)); + rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_age, &value); + if (rc) + return rc; + break; default: D_ERROR("bad dpe_type %d.\n", entry->dpe_type); return -DER_INVAL; @@ -694,7 +713,11 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co struct rdb_kvs_attr attr; int ntargets = nnodes * dss_tgt_nr; uint32_t upgrade_global_version = DAOS_POOL_GLOBAL_VERSION; - uint32_t svc_ops_enabled = 0; + uint32_t svc_ops_enabled = 1; + /* max number of entries in svc_ops KVS: equivalent of max age (sec) x PS_OPS_PER_SEC */ + uint32_t svc_ops_age = DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_DEFAULT; + uint32_t svc_ops_max; + uint32_t svc_ops_num; uint64_t rdb_size; int rc; struct daos_prop_entry *entry; @@ -775,6 +798,8 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co } /* Create pool service operations KVS */ + attr.dsa_class = RDB_KVS_LEXICAL; + attr.dsa_order = 16; rc = rdb_tx_create_kvs(tx, kvs, &ds_pool_prop_svc_ops, &attr); if (rc != 0) { D_ERROR("failed to create service ops KVS, " DF_RC "\n", DP_RC(rc)); @@ -782,15 +807,50 @@ init_pool_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, uint32_t nnodes, co } /* Determine if duplicate service operations detection will be enabled */ - rc = rdb_get_size(tx->dt_db, &rdb_size); - if (rc != 0) - goto out_map_buf; - if (rdb_size >= DUP_OP_MIN_RDB_SIZE) - svc_ops_enabled = 1; + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENABLED); + if (entry) + svc_ops_enabled = entry->dpe_val; + if (svc_ops_enabled) { + rc = rdb_get_size(tx->dt_db, &rdb_size); + if (rc != 0) + goto out_map_buf; + if (rdb_size < DUP_OP_MIN_RDB_SIZE) { + svc_ops_enabled = 0; + D_WARN("pool duplicate ops detection disabled due to rdb size %zu < %u\n", + rdb_size, DUP_OP_MIN_RDB_SIZE); + } + } d_iov_set(&value, &svc_ops_enabled, sizeof(svc_ops_enabled)); rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_enabled, &value); + if (rc != 0) { + DL_ERROR(rc, "failed to set svc_ops_enabled"); + goto out_map_buf; + } + + /* Maximum number of RPCs that may be kept in svc_ops, from SVC_OPS_ENTRY_AGE property. + * Default: PS_OPS_PER_SEC x DEFAULT_SVC_OPS_ENTRY_AGE_SEC. 
+ */ + entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENTRY_AGE); + if (entry) + svc_ops_age = entry->dpe_val; + svc_ops_max = PS_OPS_PER_SEC * svc_ops_age; + svc_ops_num = 0; + d_iov_set(&value, &svc_ops_age, sizeof(svc_ops_age)); + rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_age, &value); + if (rc != 0) { + DL_ERROR(rc, "failed to set svc_ops_age"); + goto out_map_buf; + } + d_iov_set(&value, &svc_ops_max, sizeof(svc_ops_max)); + rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_max, &value); + if (rc != 0) { + DL_ERROR(rc, "failed to set svc_ops_max"); + goto out_map_buf; + } + d_iov_set(&value, &svc_ops_num, sizeof(svc_ops_num)); + rc = rdb_tx_update(tx, kvs, &ds_pool_prop_svc_ops_num, &value); if (rc != 0) - D_ERROR("failed to set svc_ops_enabled, " DF_RC "\n", DP_RC(rc)); + DL_ERROR(rc, "failed to set svc_ops_num"); out_map_buf: pool_buf_free(map_buf); @@ -1517,6 +1577,8 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf, bool version_exists = false; bool rdb_size_ok = false; uint32_t svc_ops_enabled = 0; + uint32_t svc_ops_max = 0; + uint32_t svc_ops_age = 0; uint64_t rdb_size; struct daos_prop_entry *svc_rf_entry; int rc; @@ -1616,10 +1678,33 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf, DP_UUID(svc->ps_uuid), DP_RC(rc)); goto out_lock; } + svc->ps_ops_enabled = svc_ops_enabled; - D_DEBUG(DB_MD, DF_UUID ": duplicate ops detection %s (rdb size " DF_U64 " %s %u minimum)\n", + d_iov_set(&value, &svc_ops_max, sizeof(svc_ops_max)); + rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_svc_ops_max, &value); + if (rc == -DER_NONEXIST) { + rc = 0; + } else if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_max", DP_UUID(svc->ps_uuid)); + goto out_lock; + } + svc->ps_ops_max = svc_ops_max; + + d_iov_set(&value, &svc_ops_age, sizeof(svc_ops_age)); + rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_svc_ops_age, &value); + if (rc == -DER_NONEXIST) { + rc = 0; + } else if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_age", DP_UUID(svc->ps_uuid)); + goto out_lock; + } + svc->ps_ops_age = svc_ops_age; + + D_DEBUG(DB_MD, + DF_UUID ": duplicate ops detection %s (rdb size " DF_U64 " %s %u minimum), " + "max entries %u, max entry age %u sec\n", DP_UUID(svc->ps_uuid), svc_ops_enabled ? "enabled" : "disabled", rdb_size, - rdb_size_ok ? ">=" : "<", DUP_OP_MIN_RDB_SIZE); + rdb_size_ok ? ">=" : "<", DUP_OP_MIN_RDB_SIZE, svc_ops_max, svc_ops_age); out_lock: ABT_rwlock_unlock(svc->ps_lock); @@ -1810,6 +1895,11 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) } else { uuid_generate(pool_hdl_uuid); uuid_generate(cont_hdl_uuid); + /* Only copy server handle to make is_from_srv() check correctly, and + * container server handle will not be copied here, otherwise + * ds_pool_iv_refresh_hdl will not open the server container handle. 
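Editor's note on the svc_ops sizing in init_pool_metadata() above: svc_ops_max is simply the retention age in seconds multiplied by PS_OPS_PER_SEC (4096, defined earlier in srv_pool.c). A standalone sketch of the arithmetic follows; the 150-second age is an illustrative assumption, not necessarily the value of DAOS_PROP_PO_SVC_OPS_ENTRY_AGE_DEFAULT:

```c
#include <stdint.h>
#include <stdio.h>

/* Sizing sketch for the svc_ops KVS: the cap is the retention age in
 * seconds multiplied by the assumed peak PS RPC rate (PS_OPS_PER_SEC,
 * 4096 in srv_pool.c). The 150 s age below is an assumption for
 * illustration only. */
#define PS_OPS_PER_SEC 4096

int main(void)
{
	uint32_t svc_ops_age = 150; /* assumed entry-age default, in seconds */
	uint32_t svc_ops_max = PS_OPS_PER_SEC * svc_ops_age;

	/* 4096 ops/s * 150 s = 614400 retained entries at most */
	printf("svc_ops_max = %u\n", svc_ops_max);
	return 0;
}
```

At that assumed age the KVS is capped at 614400 entries; raising the entry-age property scales the cap linearly.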
+ */ + uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid); } rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, @@ -2534,6 +2624,7 @@ pool_prop_read(struct rdb_tx *tx, const struct pool_svc *svc, uint64_t bits, prop->dpp_entries[idx].dpe_val = val; idx++; } + if (bits & DAOS_PO_QUERY_PROP_SCRUB_FREQ) { d_iov_set(&value, &val, sizeof(val)); rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_scrub_freq, @@ -2550,6 +2641,7 @@ pool_prop_read(struct rdb_tx *tx, const struct pool_svc *svc, uint64_t bits, prop->dpp_entries[idx].dpe_val = val; idx++; } + if (bits & DAOS_PO_QUERY_PROP_SCRUB_THRESH) { d_iov_set(&value, &val, sizeof(val)); rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_scrub_thresh, @@ -2598,6 +2690,7 @@ pool_prop_read(struct rdb_tx *tx, const struct pool_svc *svc, uint64_t bits, prop->dpp_entries[idx].dpe_val = val32; idx++; } + if (bits & DAOS_PO_QUERY_PROP_CHECKPOINT_FREQ) { d_iov_set(&value, &val32, sizeof(val32)); rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_checkpoint_freq, &value); @@ -2613,6 +2706,7 @@ pool_prop_read(struct rdb_tx *tx, const struct pool_svc *svc, uint64_t bits, prop->dpp_entries[idx].dpe_val = val32; idx++; } + if (bits & DAOS_PO_QUERY_PROP_CHECKPOINT_THRESH) { d_iov_set(&value, &val32, sizeof(val32)); rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_checkpoint_thresh, &value); @@ -2648,6 +2742,48 @@ pool_prop_read(struct rdb_tx *tx, const struct pool_svc *svc, uint64_t bits, idx++; } + if (bits & DAOS_PO_QUERY_PROP_SVC_OPS_ENABLED) { + d_iov_set(&value, &val32, sizeof(val32)); + rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_svc_ops_enabled, &value); + if (rc == -DER_NONEXIST && global_ver < DAOS_POOL_GLOBAL_VERSION_WITH_SVC_OPS_KVS) { + /* needs to be upgraded */ + rc = 0; + val32 = 0; + prop->dpp_entries[idx].dpe_flags |= DAOS_PROP_ENTRY_NOT_SET; + } else if (rc != 0) { + DL_ERROR(rc, DF_UUID ": DAOS_PROP_PO_SVC_OPS_ENABLED missing from the pool", + DP_UUID(svc->ps_uuid)); + D_GOTO(out_prop, rc); + } + if (rc != 0) + D_GOTO(out_prop, rc); + D_ASSERT(idx < nr); + prop->dpp_entries[idx].dpe_type = DAOS_PROP_PO_SVC_OPS_ENABLED; + prop->dpp_entries[idx].dpe_val = val32; + idx++; + } + + if (bits & DAOS_PO_QUERY_PROP_SVC_OPS_ENTRY_AGE) { + d_iov_set(&value, &val32, sizeof(val32)); + rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_svc_ops_age, &value); + if (rc == -DER_NONEXIST && global_ver < DAOS_POOL_GLOBAL_VERSION_WITH_SVC_OPS_KVS) { + /* needs to be upgraded */ + rc = 0; + val32 = 0; + prop->dpp_entries[idx].dpe_flags |= DAOS_PROP_ENTRY_NOT_SET; + } else if (rc != 0) { + DL_ERROR(rc, DF_UUID ": DAOS_PROP_PO_SVC_OPS_ENTRY_AGE missing from pool", + DP_UUID(svc->ps_uuid)); + D_GOTO(out_prop, rc); + } + if (rc != 0) + D_GOTO(out_prop, rc); + D_ASSERT(idx < nr); + prop->dpp_entries[idx].dpe_type = DAOS_PROP_PO_SVC_OPS_ENTRY_AGE; + prop->dpp_entries[idx].dpe_val = val32; + idx++; + } + *prop_out = prop; return 0; @@ -2687,145 +2823,282 @@ pool_op_is_write(crt_opcode_t opc) return is_write; } +#if 0 +/* DEBUG */ +static int +pool_op_iter_cb(daos_handle_t ih, d_iov_t *key_enc, d_iov_t *val, void *arg) +{ + struct ds_pool_svc_op_key op_key; + struct ds_pool_svc_op_val *op_val = val->iov_buf; + + ds_pool_svc_op_key_decode(key_enc, &op_key); + + D_DEBUG(DB_MD, "key: time=" DF_X64 ", cli=" DF_UUID ", rc=%d\n", + op_key.ok_client_time, DP_UUID(op_key.ok_client_id), op_val->ov_rc); + + return 0; +} +#endif + +static int +pool_op_check_delete_oldest(struct rdb_tx *tx, struct pool_svc *svc, bool dup_op, + uint32_t *svc_ops_num) +{ + 
int rc; + d_iov_t key1_enc; + struct ds_pool_svc_op_key k1; + uint64_t t1_sec; + uint64_t t2_sec; + uint64_t age_sec; + + if (svc->ps_ops_enabled == 0) + return 0; + + d_iov_set(&key1_enc, NULL, 0); + rc = rdb_tx_fetch(tx, &svc->ps_ops, RDB_PROBE_FIRST, NULL /* key_in */, &key1_enc, + NULL /* value */); + if (rc == -DER_NONEXIST) + return 0; + else if (rc != 0) { + DL_ERROR(rc, "failed to probe first ps_ops entry"); + return rc; + } + + rc = ds_pool_svc_op_key_decode(&key1_enc, &k1); + if (rc != 0) { + DL_ERROR(rc, "key decode failed"); + return rc; + } + + /* If number of RPCs is at the limit, or the oldest is more than ps_ops_age old, + * delete the oldest entry. TODO: evict many/all such entries (during periodic cleanup?). + */ + t1_sec = d_hlc2sec(k1.ok_client_time); + t2_sec = d_hlc2sec(d_hlc_get()); + age_sec = t2_sec - t1_sec; + + if ((*svc_ops_num < svc->ps_ops_max) && (age_sec <= svc->ps_ops_age)) + return 0; + + D_DEBUG(DB_MD, DF_UUID ": will delete oldest entry, svc_ops_num=%u, age=%zu sec\n", + DP_UUID(svc->ps_uuid), *svc_ops_num, age_sec); + rc = rdb_tx_delete(tx, &svc->ps_ops, &key1_enc); + if (rc != 0) { + DL_ERROR(rc, "failed to delete oldest entry in ps_ops"); + return rc; + } + + *svc_ops_num -= 1; + return 0; +} + /* Check if this is a duplicate/retry operation that was already done, and if so the stored result. * Return the answer in is_dup (when rc == 0). Further when is_dup is true, assign value into valp. + * Common function called by pool and container service RPC op lookup functions, */ -static int -pool_op_lookup(struct rdb_tx *tx, struct pool_svc *svc, crt_rpc_t *rpc, int pool_proto_ver, - bool *is_dup, struct ds_pool_svc_op_val *valp) +int +ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t *cli_uuidp, + uint64_t cli_time, bool *is_dup, struct ds_pool_svc_op_val *valp) { - struct pool_op_v6_in *in6 = crt_req_get(rpc); + struct pool_svc *svc = pool_svc; + bool need_put_svc = false; struct ds_pool_svc_op_key op_key; + d_iov_t op_key_enc = {.iov_buf = NULL}; struct ds_pool_svc_op_val op_val; - d_iov_t key; d_iov_t val; - uint32_t svc_ops_enabled; - bool proto_enabled; - bool dup = false; - crt_opcode_t opc = opc_get(rpc->cr_opc); + bool duplicate = false; int rc = 0; - /* If client didn't provide a key (old protocol), skip */ - proto_enabled = (pool_proto_ver >= POOL_PROTO_VER_WITH_SVC_OP_KEY); - if (!proto_enabled) - goto out; + if (!svc) { + rc = pool_svc_lookup(pool_uuid, &svc); + if (rc != 0) { + DL_ERROR(rc, "pool_svc lookup failed"); + goto out; + } + need_put_svc = true; + } - /* If the operation is not a write, skip (read-only ops not tracked for duplicates) */ - if (!pool_op_is_write(opc)) - goto out; + if (!svc->ps_ops_enabled) + goto out_svc; - /* If enabled, lookup client-provided op key, assign dup_op accordingly. 
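A note on why pool_op_check_delete_oldest() above can find the oldest entry with a single RDB_PROBE_FIRST: the svc_ops KVS is created as RDB_KVS_LEXICAL, so the encoded key must compare byte-wise in chronological order. Below is a hypothetical encoder illustrating that property; the real one is ds_pool_svc_op_key_encode(), whose exact layout may differ:

```c
#include <stdint.h>
#include <string.h>
#include <uuid/uuid.h>

/* Hypothetical key encoder, for illustration only (the real DAOS encoder is
 * ds_pool_svc_op_key_encode()). Storing the HLC timestamp big-endian makes
 * byte-wise (lexical) key order equal to chronological order, so probing for
 * the first key in a RDB_KVS_LEXICAL KVS returns the oldest entry. */
struct op_key_wire {
	uint8_t time_be[8]; /* HLC timestamp, most significant byte first */
	uuid_t  client_id;  /* tie-breaker between clients */
};

static void
op_key_encode_example(uint64_t hlc_time, const uuid_t client, struct op_key_wire *out)
{
	for (int i = 0; i < 8; i++)
		out->time_be[i] = (uint8_t)(hlc_time >> (56 - 8 * i));
	uuid_copy(out->client_id, client);
}
```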
*/ - /* TODO: lookup from a cached value in struct pool_svc rather than rdb */ - d_iov_set(&val, &svc_ops_enabled, sizeof(svc_ops_enabled)); - rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_svc_ops_enabled, &val); - if (rc == -DER_NONEXIST) { - rc = 0; - goto out; - } else if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_enabled", in6->pi_uuid); - goto out; +#if 0 + /* DEBUG */ + rc = rdb_tx_iterate(tx, &svc->ps_ops, false /* backward */, pool_op_iter_cb, + NULL /* arg */); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to iterate ps_ops KVS", DP_UUID(pool_uuid)); + goto out_svc; } - if (!svc_ops_enabled) - goto out; +#endif - uuid_copy(op_key.ok_client_id, in6->pi_cli_id); - op_key.ok_client_time = in6->pi_time; - d_iov_set(&key, &op_key, sizeof(op_key)); + /* Construct (encoded) client ID key, look for it (duplicate RPC) in ps_ops */ d_iov_set(&val, &op_val, sizeof(op_val)); - - rc = rdb_tx_lookup(tx, &svc->ps_ops, &key, &val); + uuid_copy(op_key.ok_client_id, *cli_uuidp); + op_key.ok_client_time = cli_time; + rc = ds_pool_svc_op_key_encode(&op_key, &op_key_enc); + if (rc != 0) + goto out_svc; + rc = rdb_tx_lookup(tx, &svc->ps_ops, &op_key_enc, &val); if (rc == 0) { /* found - this is a retry/duplicate RPC being handled */ D_DEBUG(DB_MD, - DF_UUID ": retry RPC detected client=" DF_UUID " time=" DF_X64 - " op_rc=%d\n", - DP_UUID(in6->pi_uuid), DP_UUID(in6->pi_cli_id), in6->pi_time, op_val.ov_rc); - dup = true; + DF_UUID ": retry RPC detected client=" DF_UUID " time=%016lx op_rc=%d\n", + DP_UUID(pool_uuid), DP_UUID(*cli_uuidp), cli_time, op_val.ov_rc); + duplicate = true; } else if (rc == -DER_NONEXIST) { /* not found - new, unique RPC being handled */ rc = 0; } else { - DL_ERROR(rc, DF_UUID ": failed to lookup RPC client=" DF_UUID " time=" DF_X64, - DP_UUID(in6->pi_uuid), DP_UUID(in6->pi_cli_id), in6->pi_time); - goto out; + DL_ERROR(rc, DF_UUID ": failed to lookup RPC client=" DF_UUID " time=%016lx", + DP_UUID(pool_uuid), DP_UUID(*cli_uuidp), cli_time); + goto out_enc; } +out_enc: + D_FREE(op_key_enc.iov_buf); +out_svc: + if (need_put_svc) + pool_svc_put(svc); out: if (rc == 0) { - *is_dup = dup; - if (dup) + *is_dup = duplicate; + if (duplicate) *valp = op_val; } return rc; } -/* Save results of the (new, not duplicate) operation in svc_ops KVS. */ +/* Check if this is a duplicate/retry operation that was already done, and if so the stored result. + * Return the answer in is_dup (when rc == 0). Further when is_dup is true, assign value into valp. 
+ */ static int -pool_op_save(struct rdb_tx *tx, struct pool_svc *svc, crt_rpc_t *rpc, int pool_proto_ver, int rc_in, - struct ds_pool_svc_op_val *op_valp) +pool_op_lookup(struct rdb_tx *tx, struct pool_svc *svc, crt_rpc_t *rpc, int pool_proto_ver, + bool *is_dup, struct ds_pool_svc_op_val *valp) { - struct pool_op_v6_in *in6 = crt_req_get(rpc); - d_iov_t key; - d_iov_t val; - struct ds_pool_svc_op_key op_key; - uint32_t svc_ops_enabled; - bool proto_enabled; - crt_opcode_t opc = opc_get(rpc->cr_opc); - int rc = 0; - - op_valp->ov_rc = rc_in; + struct pool_op_v6_in *in6 = crt_req_get(rpc); + crt_opcode_t opc = opc_get(rpc->cr_opc); + int rc = 0; /* If client didn't provide a key (old protocol), skip */ - proto_enabled = (pool_proto_ver >= POOL_PROTO_VER_WITH_SVC_OP_KEY); - if (!proto_enabled) + if (pool_proto_ver < POOL_PROTO_VER_WITH_SVC_OP_KEY) goto out; /* If the operation is not a write, skip (read-only ops not tracked for duplicates) */ if (!pool_op_is_write(opc)) goto out; - /* If enabled, save client-provided op key and result of the operation. */ - d_iov_set(&val, &svc_ops_enabled, sizeof(svc_ops_enabled)); - rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_svc_ops_enabled, &val); - if (rc == -DER_NONEXIST) { - rc = 0; - goto out; - } else if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_enabled", DP_UUID(in6->pi_uuid)); - goto out; + rc = ds_pool_svc_ops_lookup(tx, svc, svc->ps_uuid, &in6->pi_cli_id, in6->pi_time, is_dup, + valp); + +out: + return rc; +} + +int +ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t *cli_uuidp, + uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp) +{ + struct pool_svc *svc = pool_svc; + bool need_put_svc = false; + d_iov_t val; + struct ds_pool_svc_op_key op_key; + d_iov_t op_key_enc = {.iov_buf = NULL}; + uint32_t svc_ops_num; + uint32_t new_svc_ops_num; + int rc = 0; + + if (!svc) { + rc = pool_svc_lookup(pool_uuid, &svc); + if (rc != 0) { + DL_ERROR(rc, "pool_svc lookup failed"); + goto out; + } + need_put_svc = true; } - if (!svc_ops_enabled) - goto out; - /* TODO: implement mechanism to constrain rdb space usage by this KVS. */ - goto out; + if (!svc->ps_ops_enabled) + goto out_svc; + + /* Get number of entries in the KVS for incrementing/decrementing as applicable below */ + d_iov_set(&val, &svc_ops_num, sizeof(svc_ops_num)); + rc = rdb_tx_lookup(tx, &svc->ps_root, &ds_pool_prop_svc_ops_num, &val); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to lookup svc_ops_num", DP_UUID(pool_uuid)); + goto out_svc; + } + new_svc_ops_num = svc_ops_num; - /* Save result in ps_ops KVS, only if the return code is "definitive" (not retryable). */ - if (!daos_rpc_retryable_rc(op_valp->ov_rc)) { + if (!dup_op && !daos_rpc_retryable_rc(op_valp->ov_rc)) { /* If the write operation failed, discard its (unwanted) updates first. 
*/ if (op_valp->ov_rc != 0) rdb_tx_discard(tx); - uuid_copy(op_key.ok_client_id, in6->pi_cli_id); - op_key.ok_client_time = in6->pi_time; - d_iov_set(&key, &op_key, sizeof(op_key)); + /* Construct (encoded) client ID key, insert an entry into ps_ops */ d_iov_set(&val, op_valp, sizeof(*op_valp)); - - rc = rdb_tx_lookup(tx, &svc->ps_ops, &key, &val); - if (rc != -DER_NONEXIST) { - D_ASSERT(rc != 0); - goto out; + uuid_copy(op_key.ok_client_id, *cli_uuidp); + op_key.ok_client_time = cli_time; + rc = ds_pool_svc_op_key_encode(&op_key, &op_key_enc); + if (rc != 0) + goto out_svc; + rc = rdb_tx_update(tx, &svc->ps_ops, &op_key_enc, &val); + if (rc != 0) { + DL_ERROR(rc, + DF_UUID ": svc_ops update failed: client=" DF_UUID " time=%016lx", + DP_UUID(pool_uuid), DP_UUID(*cli_uuidp), cli_time); + goto out_enc; } + new_svc_ops_num++; + } + + rc = pool_op_check_delete_oldest(tx, svc, dup_op, &new_svc_ops_num); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed pool_op_check_delete_oldest()", DP_UUID(pool_uuid)); + goto out_enc; + } - rc = rdb_tx_update(tx, &svc->ps_ops, &key, &val); + /* update the number of entries in the KVS */ + if (new_svc_ops_num != svc_ops_num) { + svc_ops_num = new_svc_ops_num; + d_iov_set(&val, &svc_ops_num, sizeof(svc_ops_num)); + rc = rdb_tx_update(tx, &svc->ps_root, &ds_pool_prop_svc_ops_num, &val); if (rc != 0) { - DL_ERROR(rc, - DF_UUID ": failed to update svc_ops client=" DF_UUID - " time=" DF_X64, - DP_UUID(in6->pi_uuid), DP_UUID(in6->pi_cli_id), in6->pi_time); - goto out; + DL_ERROR(rc, DF_UUID ": failed to update svc_ops_num", DP_UUID(pool_uuid)); + goto out_enc; } } +out_enc: + D_FREE(op_key_enc.iov_buf); +out_svc: + if (need_put_svc) + pool_svc_put(svc); +out: + return rc; +} + +/* Save results of the (new, not duplicate) operation in svc_ops KVS, if applicable. + * And delete oldest entry if KVS has reached maximum number, or oldest exceeds age limit. 
+ */ +static int +pool_op_save(struct rdb_tx *tx, struct pool_svc *svc, crt_rpc_t *rpc, int pool_proto_ver, + bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp) +{ + struct pool_op_v6_in *in6 = crt_req_get(rpc); + crt_opcode_t opc = opc_get(rpc->cr_opc); + int rc = 0; + + if (!dup_op) + op_valp->ov_rc = rc_in; + + /* If client didn't provide a key (old protocol), skip */ + if (pool_proto_ver < POOL_PROTO_VER_WITH_SVC_OP_KEY) + goto out; + + /* If the operation is not a write, skip (read-only ops not tracked for duplicates) */ + if (!pool_op_is_write(opc)) + goto out; + + rc = ds_pool_svc_ops_save(tx, svc, svc->ps_uuid, &in6->pi_cli_id, in6->pi_time, dup_op, + rc_in, op_valp); out: return rc; @@ -3071,10 +3344,15 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) bool transfer_map = false; bool fi_pass_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_PASS_NOREPLY); bool fi_fail_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_FAIL_NOREPLY); + bool fi_pass_nl_noreply; + bool fi_fail_nl_noreply; D_DEBUG(DB_MD, DF_UUID ": processing rpc: %p hdl=" DF_UUID "\n", DP_UUID(in->pci_op.pi_uuid), rpc, DP_UUID(in->pci_op.pi_hdl)); + fi_pass_nl_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_PASS_NOREPLY_NEWLDR); + fi_fail_nl_noreply = DAOS_FAIL_CHECK(DAOS_MD_OP_FAIL_NOREPLY_NEWLDR); + rc = pool_svc_lookup_leader(in->pci_op.pi_uuid, &svc, &out->pco_op.po_hint); if (rc != 0) @@ -3100,7 +3378,7 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) goto out_lock; else if (dup_op) skip_update = true; - if (fi_fail_noreply) + if (fi_fail_noreply || fi_fail_nl_noreply) goto out_map_version; /* Check if pool is being destroyed and not accepting connections */ @@ -3321,11 +3599,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) out_map_version: out->pco_op.po_map_version = ds_pool_get_version(svc->ps_pool); + D_DEBUG(DB_MD, DF_UUID ": rc=%d, dup_op=%d\n", DP_UUID(in->pci_op.pi_uuid), rc, dup_op); /* If meets criteria (not dup, write op, definitive rc, etc.), store result in ps_ops KVS */ - if ((rc == 0) && !dup_op && fi_fail_noreply) + if ((rc == 0) && !dup_op && (fi_fail_noreply || fi_fail_nl_noreply)) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, handler_version, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, handler_version, dup_op, rc, &op_val); if (rc != 0) goto out_lock; rc = rdb_tx_commit(&tx); @@ -3371,6 +3649,18 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) D_DEBUG(DB_MD, DF_UUID ": fault injected: DAOS_MD_OP_FAIL_NOREPLY\n", DP_UUID(in->pci_op.pi_uuid)); } + if ((rc == 0) && !dup_op && fi_pass_nl_noreply) { + rc = -DER_TIMEDOUT; + D_DEBUG(DB_MD, DF_UUID ": fault injected: DAOS_MD_OP_PASS_NOREPLY_NEWLDR\n", + DP_UUID(in->pci_op.pi_uuid)); + rdb_resign(svc->ps_rsvc.s_db, svc->ps_rsvc.s_term); + } + if ((rc == -DER_MISC) && !dup_op && fi_fail_nl_noreply) { + rc = -DER_TIMEDOUT; + D_DEBUG(DB_MD, DF_UUID ": fault injected: DAOS_MD_OP_FAIL_NOREPLY_NEWLDR\n", + DP_UUID(in->pci_op.pi_uuid)); + rdb_resign(svc->ps_rsvc.s_db, svc->ps_rsvc.s_term); + } out->pco_op.po_rc = rc; D_DEBUG(DB_MD, DF_UUID ": replying rpc: %p " DF_RC "\n", DP_UUID(in->pci_op.pi_uuid), rpc, @@ -3551,8 +3841,7 @@ ds_pool_disconnect_handler(crt_rpc_t *rpc, int handler_version) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, handler_version, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, handler_version, dup_op, rc, &op_val); if (rc != 0) goto out_lock; rc = rdb_tx_commit(&tx); @@ -4237,6 +4526,8 @@ 
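The handler changes in this hunk all converge on one pattern: look the op up first, skip the body on a duplicate, and call pool_op_save() unconditionally with dup_op so that both paths share the commit. A minimal sketch of that shape, assuming the surrounding srv_pool.c context; do_write_example() is a placeholder, not a DAOS function:

```c
/* Sketch of the duplicate-op handler pattern used above; do_write_example()
 * stands in for the handler's real work and is not a DAOS API. */
static int
handler_pattern_example(struct rdb_tx *tx, struct pool_svc *svc, crt_rpc_t *rpc, int ver)
{
	struct ds_pool_svc_op_val op_val = {0};
	bool                      dup_op = false;
	int                       rc;

	rc = pool_op_lookup(tx, svc, rpc, ver, &dup_op, &op_val);
	if (rc != 0)
		return rc;
	if (!dup_op)
		rc = do_write_example(tx); /* placeholder for the actual update */
	/* Saving runs for new and duplicate ops alike; on a duplicate it
	 * leaves the stored result in op_val instead of recording a new one. */
	rc = pool_op_save(tx, svc, rpc, ver, dup_op, rc, &op_val);
	if (rc != 0)
		return rc;
	rc = rdb_tx_commit(tx);
	return rc != 0 ? rc : op_val.ov_rc;
}
```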
ds_pool_query_handler(crt_rpc_t *rpc, int handler_version) case DAOS_PROP_PO_CHECKPOINT_FREQ: case DAOS_PROP_PO_CHECKPOINT_THRESH: case DAOS_PROP_PO_REINT_MODE: + case DAOS_PROP_PO_SVC_OPS_ENABLED: + case DAOS_PROP_PO_SVC_OPS_ENTRY_AGE: if (entry->dpe_val != iv_entry->dpe_val) { D_ERROR("type %d mismatch "DF_U64" - " DF_U64".\n", entry->dpe_type, @@ -4296,8 +4587,7 @@ ds_pool_query_handler(crt_rpc_t *rpc, int handler_version) metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE]; /* See comment above, rebuild doesn't connect the pool */ - if ((query_bits & DAOS_PO_QUERY_SPACE) && - !is_pool_from_srv(in->pqi_op.pi_uuid, in->pqi_op.pi_hdl)) { + if (query_bits & DAOS_PO_QUERY_SPACE) { rc = pool_space_query_bcast(rpc->cr_ctx, svc, in->pqi_op.pi_hdl, &out->pqo_space); if (unlikely(rc)) @@ -4898,8 +5188,7 @@ ds_pool_prop_set_handler(crt_rpc_t *rpc) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, dup_op, rc, &op_val); if (rc != 0) goto out_lock; @@ -5258,7 +5547,7 @@ pool_upgrade_props(struct rdb_tx *tx, struct pool_svc *svc, struct rdb_kvs_attr attr; D_DEBUG(DB_MD, DF_UUID ": creating service ops KVS\n", DP_UUID(pool_uuid)); - attr.dsa_class = RDB_KVS_GENERIC; + attr.dsa_class = RDB_KVS_LEXICAL; attr.dsa_order = 16; rc = rdb_tx_create_kvs(tx, &svc->ps_root, &ds_pool_prop_svc_ops, &attr); if (rc != 0) { @@ -5697,6 +5986,16 @@ ds_pool_svc_set_prop(uuid_t pool_uuid, d_rank_list_t *ranks, daos_prop_t *prop) D_GOTO(out, rc = -DER_NO_PERM); } + if (daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENABLED)) { + D_ERROR("Can't set pool svc_ops_enabled on existing pool.\n"); + D_GOTO(out, rc = -DER_NO_PERM); + } + + if (daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_OPS_ENTRY_AGE)) { + D_ERROR("Can't set pool svc_ops_entry_age on existing pool.\n"); + D_GOTO(out, rc = -DER_NO_PERM); + } + /* Disallow to begin with; will support in the future. 
*/ if (daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_REDUN_FAC)) { D_ERROR(DF_UUID ": cannot set pool service redundancy factor on existing pool\n", @@ -5858,8 +6157,7 @@ ds_pool_acl_update_handler(crt_rpc_t *rpc) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, dup_op, rc, &op_val); if (rc != 0) goto out_lock; @@ -6035,8 +6333,7 @@ ds_pool_acl_delete_handler(crt_rpc_t *rpc) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, dup_op, rc, &op_val); if (rc != 0) goto out_lock; @@ -6865,12 +7162,14 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, D_GOTO(out, rc); } - env = getenv(REBUILD_ENV); + d_agetenv_str(&env, REBUILD_ENV); if ((env && !strcasecmp(env, REBUILD_ENV_DISABLED)) || daos_fail_check(DAOS_REBUILD_DISABLE)) { D_DEBUG(DB_TRACE, "Rebuild is disabled\n"); + d_freeenv_str(&env); D_GOTO(out, rc = 0); } + d_freeenv_str(&env); rc = ds_pool_iv_prop_fetch(svc->ps_pool, &prop); if (rc) @@ -7392,8 +7691,7 @@ ds_pool_evict_handler(crt_rpc_t *rpc) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, DAOS_POOL_VERSION, dup_op, rc, &op_val); if (rc != 0) goto out_lock; rc = rdb_tx_commit(&tx); @@ -7820,8 +8118,7 @@ ds_pool_attr_set_handler(crt_rpc_t *rpc, int handler_version) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, handler_version, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, handler_version, dup_op, rc, &op_val); if (rc != 0) goto out_lock; @@ -7909,8 +8206,7 @@ ds_pool_attr_del_handler(crt_rpc_t *rpc, int handler_version) out_commit: if ((rc == 0) && !dup_op && fi_fail_noreply) rc = -DER_MISC; - if (!dup_op) - rc = pool_op_save(&tx, svc, rpc, handler_version, rc, &op_val); + rc = pool_op_save(&tx, svc, rpc, handler_version, dup_op, rc, &op_val); if (rc != 0) goto out_lock; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 8dce5ae86618..7fbff30e061d 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -637,6 +637,16 @@ pool_obj(struct daos_llink *llink) return container_of(llink, struct ds_pool, sp_entry); } +static inline void +pool_put_sync(void *args) +{ + struct ds_pool *pool = args; + + D_ASSERT(pool != NULL); + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + daos_lru_ref_release(pool_cache, &pool->sp_entry); +} + struct ds_pool_create_arg { uint32_t pca_map_version; }; @@ -857,7 +867,7 @@ ds_pool_lookup(const uuid_t uuid, struct ds_pool **pool) *pool = pool_obj(llink); if ((*pool)->sp_stopping) { D_DEBUG(DB_MD, DF_UUID": is in stopping\n", DP_UUID(uuid)); - ds_pool_put(*pool); + pool_put_sync(*pool); *pool = NULL; return -DER_SHUTDOWN; } @@ -876,9 +886,31 @@ ds_pool_get(struct ds_pool *pool) void ds_pool_put(struct ds_pool *pool) { - D_ASSERT(pool != NULL); - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - daos_lru_ref_release(pool_cache, &pool->sp_entry); + int rc; + + /* + * Someone has stopped the pool. 
The current user may be the one holding the last
+	 * reference on the pool; dropping that reference would trigger pool_free_ref(), which
+	 * stops related containers that may in turn wait for the current user (ULT) to exit.
+	 * To avoid this deadlock, drop the reference asynchronously from an independent ULT
+	 * and let the current ULT proceed.
+	 *
+	 * An example of such a deadlock cycle:
+	 *
+	 * cont_iv_prop_fetch_ult => ds_pool_put => pool_free_ref [WAIT]=> cont_child_stop =>
+	 * cont_stop_agg [WAIT]=> cont_agg_ult => ds_cont_csummer_init => ds_cont_get_props =>
+	 * cont_iv_prop_fetch [WAIT]=> cont_iv_prop_fetch_ult
+	 */
+	if (unlikely(pool->sp_stopping) && daos_lru_is_last_user(&pool->sp_entry)) {
+		rc = dss_ult_create(pool_put_sync, pool, DSS_XS_SELF, 0, 0, NULL);
+		if (unlikely(rc != 0)) {
+			D_ERROR("Failed to create ULT to async put ref on the pool "DF_UUID"\n",
+				DP_UUID(pool->sp_uuid));
+			pool_put_sync(pool);
+		}
+	} else {
+		pool_put_sync(pool);
+	}
 }
 
 void
@@ -1062,7 +1094,7 @@ ds_pool_start(uuid_t uuid)
 failure_ult:
 	pool_fetch_hdls_ult_abort(pool);
 failure_pool:
-	ds_pool_put(pool);
+	pool_put_sync(pool);
 	return rc;
 }
 
@@ -1090,7 +1122,7 @@ ds_pool_stop(uuid_t uuid)
 	ds_rebuild_abort(pool->sp_uuid, -1, -1, -1);
 	ds_migrate_stop(pool, -1, -1);
 	ds_pool_put(pool); /* held by ds_pool_start */
-	ds_pool_put(pool);
+	pool_put_sync(pool);
 	D_INFO(DF_UUID": pool stopped\n", DP_UUID(uuid));
 }
 
diff --git a/src/proto/ctl/smd.proto b/src/proto/ctl/smd.proto
index 18b90798f410..e5494b73e4e8 100644
--- a/src/proto/ctl/smd.proto
+++ b/src/proto/ctl/smd.proto
@@ -84,11 +84,11 @@ enum NvmeDevState {
 }
 
 enum LedState {
-	OFF = 0; // Equivalent to SPDK_VMD_LED_STATE_OFF
+	NA = 0; // Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled)
 	QUICK_BLINK = 1; // Equivalent to SPDK_VMD_LED_STATE_IDENTIFY (4Hz blink)
 	ON = 2; // Equivalent to SPDK_VMD_LED_STATE_FAULT (solid on)
 	SLOW_BLINK = 3; // Equivalent to SPDK_VMD_LED_STATE_REBUILD (1Hz blink)
-	NA = 4; // Equivalent to SPDK_VMD_LED_STATE_UNKNOWN (VMD not enabled)
+	OFF = 4; // Equivalent to SPDK_VMD_LED_STATE_OFF
 }
 
 // NvmeController represents an NVMe Controller (SSD).
diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto
index edafa4e42bad..944d8e943bae 100644
--- a/src/proto/ctl/storage_nvme.proto
+++ b/src/proto/ctl/storage_nvme.proto
@@ -19,6 +19,7 @@ import "ctl/smd.proto";
 message NvmeControllerResult {
 	string pci_addr = 1; // PCI address of NVMe controller
 	ResponseState state = 2; // state of current operation
+	uint32 role_bits = 3; // Device active roles (bitmask)
 }
 
 message ScanNvmeReq {
diff --git a/src/rdb/rdb.c b/src/rdb/rdb.c
index f5c3eb476295..a1d39c507ef7 100644
--- a/src/rdb/rdb.c
+++ b/src/rdb/rdb.c
@@ -684,7 +684,7 @@ rdb_resign(struct rdb *db, uint64_t term)
  *
  * \param[in]	db	database
  *
- * \retval -DER_INVAL	not a voting replica
+ * \retval -DER_NO_PERM	not a voting replica or might violate a lease
  */
 int
 rdb_campaign(struct rdb *db)
diff --git a/src/rdb/rdb_layout.h b/src/rdb/rdb_layout.h
index 140e9e146388..66fb9b5788c5 100644
--- a/src/rdb/rdb_layout.h
+++ b/src/rdb/rdb_layout.h
@@ -80,15 +80,16 @@
 /*
  * Object ID
  *
- * The highest bit represents the object ID class. The remaining 63 bits
+ * The highest 2 bits represent the object ID class. The remaining 62 bits
  * represent the object number, which must be nonzero.
*/ typedef uint64_t rdb_oid_t; /* Object ID class (see rdb_oid_t) */ -#define RDB_OID_CLASS_MASK (1ULL << 63) -#define RDB_OID_CLASS_GENERIC (0ULL << 63) -#define RDB_OID_CLASS_INTEGER (1ULL << 63) +#define RDB_OID_CLASS_MASK (3ULL << 62) +#define RDB_OID_CLASS_GENERIC (0ULL << 62) +#define RDB_OID_CLASS_INTEGER (2ULL << 62) +#define RDB_OID_CLASS_LEXICAL (1ULL << 62) /* D-key for all a-keys */ extern d_iov_t rdb_dkey; diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index 8be8dc6ed991..886a873729c0 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -61,6 +61,7 @@ rdb_raft_rc(int raft_rc) case RAFT_ERR_NOMEM: return -DER_NOMEM; case RAFT_ERR_SNAPSHOT_ALREADY_LOADED: return -DER_ALREADY; case RAFT_ERR_INVALID_CFG_CHANGE: return -DER_INVAL; + case RAFT_ERR_MIGHT_VIOLATE_LEASE: return -DER_NO_PERM; default: return -DER_MISC; } } @@ -2854,7 +2855,7 @@ rdb_raft_campaign(struct rdb *db) node = raft_get_my_node(db->d_raft); if (node == NULL || !raft_node_is_voting(node)) { D_DEBUG(DB_MD, DF_DB": must be voting node\n", DP_DB(db)); - rc = -DER_INVAL; + rc = -DER_NO_PERM; goto out_mutex; } diff --git a/src/rdb/rdb_tx.c b/src/rdb/rdb_tx.c index 2d67e58042c5..ed4adb0c0d6f 100644 --- a/src/rdb/rdb_tx.c +++ b/src/rdb/rdb_tx.c @@ -678,6 +678,9 @@ rdb_oid_class(enum rdb_kvs_class class, rdb_oid_t *oid_class) case RDB_KVS_INTEGER: *oid_class = RDB_OID_CLASS_INTEGER; return 0; + case RDB_KVS_LEXICAL: + *oid_class = RDB_OID_CLASS_LEXICAL; + return 0; default: return -DER_IO; } diff --git a/src/rdb/rdb_util.c b/src/rdb/rdb_util.c index dc34f093e2a4..f53ee56de5ff 100644 --- a/src/rdb/rdb_util.c +++ b/src/rdb/rdb_util.c @@ -162,16 +162,26 @@ rdb_decode_iov_backward(const void *buf_end, size_t len, d_iov_t *iov) void rdb_oid_to_uoid(rdb_oid_t oid, daos_unit_oid_t *uoid) { - enum daos_otype_t type = DAOS_OT_MULTI_HASHED; + enum daos_otype_t type; uoid->id_pub.lo = oid & ~RDB_OID_CLASS_MASK; uoid->id_pub.hi = 0; uoid->id_shard = 0; uoid->id_layout_ver = 0; uoid->id_padding = 0; - /* Since we don't really use d-keys, use HASHED for both classes. 
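Worth noting about the rdb_layout.h change above: the new two-bit encodings were chosen so that on-disk OIDs remain valid — GENERIC stays all-zero and the old INTEGER pattern (1ULL << 63) is bit-identical to the new one (2ULL << 62), leaving 1ULL << 62 free for LEXICAL. A self-contained check of that equivalence:

```c
#include <assert.h>
#include <stdint.h>

/* The two-bit class encodings are backward compatible: GENERIC and INTEGER
 * keep the exact bit patterns they had under the old one-bit scheme, and
 * LEXICAL takes the previously unused 1ULL << 62 pattern. */
#define OLD_CLASS_INTEGER     (1ULL << 63)

#define RDB_OID_CLASS_MASK    (3ULL << 62)
#define RDB_OID_CLASS_GENERIC (0ULL << 62)
#define RDB_OID_CLASS_INTEGER (2ULL << 62)
#define RDB_OID_CLASS_LEXICAL (1ULL << 62)

int main(void)
{
	uint64_t oid = RDB_OID_CLASS_LEXICAL | 42; /* class bits | object number */

	assert(RDB_OID_CLASS_INTEGER == OLD_CLASS_INTEGER);
	assert((oid & RDB_OID_CLASS_MASK) == RDB_OID_CLASS_LEXICAL);
	assert((oid & ~RDB_OID_CLASS_MASK) == 42); /* number is the low 62 bits */
	return 0;
}
```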
*/ - if ((oid & RDB_OID_CLASS_MASK) != RDB_OID_CLASS_GENERIC) + switch (oid & RDB_OID_CLASS_MASK) { + case RDB_OID_CLASS_GENERIC: + type = DAOS_OT_MULTI_HASHED; + break; + case RDB_OID_CLASS_INTEGER: type = DAOS_OT_AKEY_UINT64; + break; + case RDB_OID_CLASS_LEXICAL: + type = DAOS_OT_MULTI_LEXICAL; + break; + default: + D_ASSERT(0); + } daos_obj_set_oid(&uoid->id_pub, type, OR_RP_1, 1, 0); } diff --git a/src/rsvc/srv.c b/src/rsvc/srv.c index 7e90aa64d2d4..97a03f1f013d 100644 --- a/src/rsvc/srv.c +++ b/src/rsvc/srv.c @@ -1389,10 +1389,11 @@ ds_rsvc_get_md_cap(void) char *v; int n; - v = getenv(DAOS_MD_CAP_ENV); /* in MB */ + d_agetenv_str(&v, DAOS_MD_CAP_ENV); /* in MB */ if (v == NULL) return size_default; n = atoi(v); + d_freeenv_str(&v); if ((n << 20) < MINIMUM_DAOS_MD_CAP_SIZE) { D_ERROR("metadata capacity too low; using %zu MB\n", size_default >> 20); diff --git a/src/tests/ftest/cart/iv_server.c b/src/tests/ftest/cart/iv_server.c index bdc549cee198..1e823972362b 100644 --- a/src/tests/ftest/cart/iv_server.c +++ b/src/tests/ftest/cart/iv_server.c @@ -1241,13 +1241,14 @@ int main(int argc, char **argv) return -1; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); if (env_self_rank == NULL) { printf("CRT_L_RANK was not set\n"); return -1; } my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); @@ -1274,7 +1275,7 @@ int main(int argc, char **argv) init_work_contexts(); /* Load the group configuration file */ - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + rc = d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); if (grp_cfg_file == NULL) { D_ERROR("CRT_L_GRP_CFG was not set\n"); assert(0); @@ -1288,6 +1289,7 @@ int main(int argc, char **argv) D_ERROR("Failed to load group file %s\n", grp_cfg_file); assert(0); } + d_freeenv_str(&grp_cfg_file); /* Start the server for myself */ DBG_PRINT("Server starting, self_rank=%d\n", my_rank); diff --git a/src/tests/ftest/cart/no_pmix_corpc_errors.c b/src/tests/ftest/cart/no_pmix_corpc_errors.c index be8886e6d22a..c69e4d900bdc 100644 --- a/src/tests/ftest/cart/no_pmix_corpc_errors.c +++ b/src/tests/ftest/cart/no_pmix_corpc_errors.c @@ -271,8 +271,9 @@ int main(int argc, char **argv) crtu_set_shutdown_delay(2); } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); @@ -326,7 +327,7 @@ int main(int argc, char **argv) } } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); rc = crt_rank_self_set(my_rank, 1 /* group_version_min */); if (rc != 0) { @@ -351,6 +352,7 @@ int main(int argc, char **argv) DBG_PRINT("self_rank=%d uri=%s grp_cfg_file=%s\n", my_rank, my_uri, grp_cfg_file); + d_freeenv_str(&grp_cfg_file); D_FREE(my_uri); rc = crt_group_size(NULL, &grp_size); diff --git a/src/tests/ftest/cart/no_pmix_group_test.c b/src/tests/ftest/cart/no_pmix_group_test.c index ecc6881dc553..7290b478aa53 100644 --- a/src/tests/ftest/cart/no_pmix_group_test.c +++ b/src/tests/ftest/cart/no_pmix_group_test.c @@ -318,8 +318,9 @@ int main(int argc, char **argv) int num_attach_retries = 20; uint32_t primary_grp_version = 1; - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* When under valgrind bump 
expected timeouts to 60 seconds */ if (D_ON_VALGRIND) { @@ -382,7 +383,7 @@ int main(int argc, char **argv) } } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); rc = crt_rank_self_set(my_rank, primary_grp_version); if (rc != 0) { @@ -407,6 +408,7 @@ int main(int argc, char **argv) DBG_PRINT("self_rank=%d uri=%s grp_cfg_file=%s\n", my_rank, my_uri, grp_cfg_file); + d_freeenv_str(&grp_cfg_file); D_FREE(my_uri); rc = crt_group_size(NULL, &grp_size); diff --git a/src/tests/ftest/cart/no_pmix_group_version.c b/src/tests/ftest/cart/no_pmix_group_version.c index 0f6aa9aedcc1..f3fab3dce0ca 100644 --- a/src/tests/ftest/cart/no_pmix_group_version.c +++ b/src/tests/ftest/cart/no_pmix_group_version.c @@ -268,8 +268,9 @@ int main(int argc, char **argv) int rc; int num_attach_retries = 20; - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* When under valgrind bump expected timeouts to 60 seconds */ if (D_ON_VALGRIND) { @@ -326,7 +327,7 @@ int main(int argc, char **argv) } } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); rc = crt_rank_self_set(my_rank, 1 /* group_version_min */); if (rc != 0) { @@ -351,6 +352,7 @@ int main(int argc, char **argv) DBG_PRINT("self_rank=%d uri=%s grp_cfg_file=%s\n", my_rank, my_uri, grp_cfg_file); + d_freeenv_str(&grp_cfg_file); D_FREE(my_uri); rc = crt_group_size(NULL, &grp_size); diff --git a/src/tests/ftest/cart/no_pmix_launcher_client.c b/src/tests/ftest/cart/no_pmix_launcher_client.c index 4c913226f77c..24999ee51c3c 100644 --- a/src/tests/ftest/cart/no_pmix_launcher_client.c +++ b/src/tests/ftest/cart/no_pmix_launcher_client.c @@ -107,7 +107,7 @@ int main(int argc, char **argv) progress_function, &crt_ctx); assert(rc == 0); - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); DBG_PRINT("Client starting with cfg_file=%s\n", grp_cfg_file); /* load group info from a config file and delete file upon return */ @@ -116,6 +116,7 @@ int main(int argc, char **argv) D_ERROR("crtu_load_group_from_file() failed; rc=%d\n", rc); assert(0); } + d_freeenv_str(&grp_cfg_file); rc = crt_group_size(grp, &grp_size); if (rc != 0) { diff --git a/src/tests/ftest/cart/no_pmix_launcher_server.c b/src/tests/ftest/cart/no_pmix_launcher_server.c index eda6ac14a05c..e19ce0810cc5 100644 --- a/src/tests/ftest/cart/no_pmix_launcher_server.c +++ b/src/tests/ftest/cart/no_pmix_launcher_server.c @@ -33,8 +33,9 @@ int main(int argc, char **argv) uint32_t grp_size; int rc; - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); @@ -83,7 +84,7 @@ int main(int argc, char **argv) } } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); if (grp_cfg_file == NULL) { D_ERROR("CRT_L_GRP_CFG was not set\n"); assert(0); @@ -105,6 +106,7 @@ int main(int argc, char **argv) DBG_PRINT("self_rank=%d uri=%s grp_cfg_file=%s\n", my_rank, my_uri, grp_cfg_file); + d_freeenv_str(&grp_cfg_file); D_FREE(my_uri); rc = crt_group_size(NULL, &grp_size); diff --git a/src/tests/ftest/cart/test_corpc_exclusive.c b/src/tests/ftest/cart/test_corpc_exclusive.c index 5f563d4d4b05..b3d81d857f7e 100644 --- a/src/tests/ftest/cart/test_corpc_exclusive.c +++ 
b/src/tests/ftest/cart/test_corpc_exclusive.c @@ -103,8 +103,9 @@ int main(void) membs.rl_nr = 3; membs.rl_ranks = memb_ranks; - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); @@ -128,7 +129,7 @@ int main(void) assert(0); } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); rc = crt_rank_self_set(my_rank, 1 /* group_version_min */); if (rc != 0) { @@ -146,6 +147,7 @@ int main(void) /* load group info from a config file and delete file upon return */ rc = crtu_load_group_from_file(grp_cfg_file, g_main_ctx, grp, my_rank, true); + d_freeenv_str(&grp_cfg_file); if (rc != 0) { D_ERROR("crtu_load_group_from_file() failed; rc=%d\n", rc); assert(0); diff --git a/src/tests/ftest/cart/test_corpc_prefwd.c b/src/tests/ftest/cart/test_corpc_prefwd.c index 92f43ed5fcd6..8aa9480476e9 100644 --- a/src/tests/ftest/cart/test_corpc_prefwd.c +++ b/src/tests/ftest/cart/test_corpc_prefwd.c @@ -125,8 +125,9 @@ int main(void) excluded_membs.rl_nr = 1; excluded_membs.rl_ranks = &excluded_ranks; - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); @@ -150,7 +151,7 @@ int main(void) assert(0); } - grp_cfg_file = getenv("CRT_L_GRP_CFG"); + d_agetenv_str(&grp_cfg_file, "CRT_L_GRP_CFG"); rc = crt_rank_self_set(my_rank, 1 /* group_version_min */); if (rc != 0) { @@ -168,6 +169,7 @@ int main(void) /* load group info from a config file and delete file upon return */ rc = crtu_load_group_from_file(grp_cfg_file, g_main_ctx, grp, my_rank, true); + d_freeenv_str(&grp_cfg_file); if (rc != 0) { D_ERROR("crtu_load_group_from_file() failed; rc=%d\n", rc); assert(0); diff --git a/src/tests/ftest/cart/test_ep_cred_server.c b/src/tests/ftest/cart/test_ep_cred_server.c index 9f747aef6c2a..29e66ae960d6 100644 --- a/src/tests/ftest/cart/test_ep_cred_server.c +++ b/src/tests/ftest/cart/test_ep_cred_server.c @@ -73,8 +73,9 @@ main(int argc, char **argv) return rc; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 40, true, true); diff --git a/src/tests/ftest/cart/test_group_np_srv.c b/src/tests/ftest/cart/test_group_np_srv.c index e950b03a9982..1353be76983b 100644 --- a/src/tests/ftest/cart/test_group_np_srv.c +++ b/src/tests/ftest/cart/test_group_np_srv.c @@ -151,8 +151,9 @@ int main(int argc, char **argv) return rc; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); diff --git a/src/tests/ftest/cart/test_multisend_server.c b/src/tests/ftest/cart/test_multisend_server.c index a0b478d63cbc..ee770ec9b4e0 100644 --- a/src/tests/ftest/cart/test_multisend_server.c +++ b/src/tests/ftest/cart/test_multisend_server.c @@ -167,8 +167,9 @@ main(int argc, char **argv) return rc; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* 
rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 40, true, true); diff --git a/src/tests/ftest/cart/test_proto_server.c b/src/tests/ftest/cart/test_proto_server.c index 5f3470c15d70..5fb587b94df9 100644 --- a/src/tests/ftest/cart/test_proto_server.c +++ b/src/tests/ftest/cart/test_proto_server.c @@ -83,8 +83,9 @@ main(int argc, char **argv) return rc; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 40, true, true); diff --git a/src/tests/ftest/cart/test_rpc_to_ghost_rank.c b/src/tests/ftest/cart/test_rpc_to_ghost_rank.c index e3a6cf950832..0e7741d364b5 100644 --- a/src/tests/ftest/cart/test_rpc_to_ghost_rank.c +++ b/src/tests/ftest/cart/test_rpc_to_ghost_rank.c @@ -507,8 +507,10 @@ int main(int argc, char **argv) return rc; } - env_self_rank = getenv("CRT_L_RANK"); + d_agetenv_str(&env_self_rank, "CRT_L_RANK"); my_rank = atoi(env_self_rank); + d_freeenv_str(&env_self_rank); + /* rank, num_attach_retries, is_server, assert_on_error */ crtu_test_init(my_rank, 20, true, true); diff --git a/src/tests/ftest/checksum/csum_error_logging.py b/src/tests/ftest/checksum/csum_error_logging.py index 38950af94a86..a512836a5ef0 100644 --- a/src/tests/ftest/checksum/csum_error_logging.py +++ b/src/tests/ftest/checksum/csum_error_logging.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -62,17 +62,17 @@ def test_csum_error_logging(self): host_devices = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices') for host, devices in host_devices.items(): for device in devices: - for entry in ('uuid', 'tgt_ids', 'role_bits', 'roles'): + for entry in ('uuid', 'tgt_ids', 'role_bits'): if entry not in device: self.fail( 'Missing {} info from dmg storage query list devices'.format(entry)) self.log.info( - 'Host %s device: uuid=%s, targets=%s, role=%s, role_bits=%s', - host, device['uuid'], device['tgt_ids'], device['roles'], device['role_bits']) + 'Host %s device: uuid=%s, targets=%s, role_bits=%s', + host, device['uuid'], device['tgt_ids'], device['role_bits']) if not device['tgt_ids']: self.log_step('Skipping device without targets on {}'.format(device['uuid'])) continue - if device['roles'] and not int(device['role_bits']) & 1: + if (int(device['role_bits']) > 0) and not int(device['role_bits']) & 1: self.log_step( 'Skipping {} device without data on {}'.format( device['role_bits'], device['uuid'])) diff --git a/src/tests/ftest/container/rf_enforcement.py b/src/tests/ftest/container/rf_enforcement.py index dcacf0913983..aafb24bd8410 100644 --- a/src/tests/ftest/container/rf_enforcement.py +++ b/src/tests/ftest/container/rf_enforcement.py @@ -40,9 +40,7 @@ def test_container_redundancy_factor_oclass_enforcement(self): :avocado: tags=all,full_regression :avocado: tags=vm - :avocado: tags=container - :avocado: tags=container_rf,cont_rf_oclass_enforcement - :avocado: tags=test_container_redundancy_factor_oclass_enforcement + :avocado: tags=container,container_rf,cont_rf_oclass_enforcement + :avocado: tags=ContRfEnforce,test_container_redundancy_factor_oclass_enforcement """ - self.mode = "cont_rf_enforcement" - self.execute_cont_rf_test() + self.execute_cont_rf_test(mode="cont_rf_enforcement") diff --git a/src/tests/ftest/control/dmg_storage_query.py 
b/src/tests/ftest/control/dmg_storage_query.py
index a9c8a99c143e..e3a2687dfd40 100644
--- a/src/tests/ftest/control/dmg_storage_query.py
+++ b/src/tests/ftest/control/dmg_storage_query.py
@@ -40,7 +40,7 @@ def get_bdev_info(self):
             for item, device in enumerate(sorted(tier.bdev_list.value)):
                 bdev_info.append(
                     {'bdev': device,
-                     'roles': ','.join(tier.bdev_roles.value or []),
+                     'roles': ','.join(tier.bdev_roles.value or ['NA']),
                      'tier': index,
                      'tgt_ids': list(range(item, targets, len(tier.bdev_list.value)))})
diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py
index 2af81e67427a..39d7e4359f54 100644
--- a/src/tests/ftest/daos_test/dfuse.py
+++ b/src/tests/ftest/daos_test/dfuse.py
@@ -1,5 +1,5 @@
 """
-  (C) Copyright 2021-2023 Intel Corporation.
+  (C) Copyright 2021-2024 Intel Corporation.
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -93,6 +93,8 @@ def run_test(self, il_lib=None):
             daos_test_env['D_LOG_MASK'] = 'INFO,IL=DEBUG'
 
         command = [self.daos_test, '--test-dir', mount_dir, '--io', '--stream']
+        if use_dfuse:
+            command.append('--lowfd')
         if cache_mode != 'writeback':
             command.append('--metadata')
diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml
index 5d08fb4493d2..c1bc395c2b05 100644
--- a/src/tests/ftest/daos_test/suite.yaml
+++ b/src/tests/ftest/daos_test/suite.yaml
@@ -10,7 +10,7 @@ timeouts:
   test_daos_degraded_mode: 450
   test_daos_management: 110
   test_daos_pool: 180
-  test_daos_container: 450
+  test_daos_container: 510
   test_daos_epoch: 125
   test_daos_verify_consistency: 105
   test_daos_io: 290
@@ -191,5 +191,3 @@ daos_tests:
     test_daos_extend_simple: 5
     test_daos_rebuild_ec: 43
     test_daos_degraded_ec: 29
-  crt_timeout:
-    test_daos_oid_allocator: 60
diff --git a/src/tests/ftest/deployment/server_rank_failure.yaml b/src/tests/ftest/deployment/server_rank_failure.yaml
index 5d422648eede..2f8974045981 100644
--- a/src/tests/ftest/deployment/server_rank_failure.yaml
+++ b/src/tests/ftest/deployment/server_rank_failure.yaml
@@ -36,12 +36,9 @@ server_config:
 
 pool_size_ratio_80:
   size: 80%
-  control_method: dmg
   rebuild_timeout: 960
-  svcn: 5
 
 pool_size_value:
   size: 500G
-  control_method: dmg
   rebuild_timeout: 240
 
 container:
diff --git a/src/tests/ftest/dfuse/bash_fd.py b/src/tests/ftest/dfuse/bash_fd.py
new file mode 100644
index 000000000000..a8a39d61a517
--- /dev/null
+++ b/src/tests/ftest/dfuse/bash_fd.py
@@ -0,0 +1,160 @@
+"""
+  (C) Copyright 2024 Intel Corporation.
+
+  SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+
+import os
+import stat
+
+from dfuse_test_base import DfuseTestBase
+from run_utils import run_remote
+
+OUTER = """#!/bin/bash
+
+set -uex
+
+[ -d out_dir ] && rm -rf out_dir
+[ -d e_out_dir ] && rm -rf e_out_dir
+mkdir out_dir
+
+cd out_dir
+
+.././bash_fd_inner.sh
+
+cd -
+
+grep . out_dir/*
+
+mkdir e_out_dir
+echo first file > e_out_dir/out_file
+echo first file >> e_out_dir/out_file
+
+echo second file > e_out_dir/other_file
+echo five >> e_out_dir/other_file
+echo six >> e_out_dir/other_file
+
+diff --new-file --recursive out_dir e_out_dir
+"""
+
+INNER = """#!/bin/bash
+
+set -ue
+
+echo Hello, about to perform some bash I/O testing similar to "configure" scripts.
+
+# Open file for read/write access.
+exec 3<>out_file
+echo first file >&3
+
+ls -l /proc/$$/fd
+
+# Open file in /proc and hold it open.
+exec 4</proc/$$/environ
+
+# Open file for write.
+exec 5>other_file
+echo second file >&5
+
+# Duplicate fd.
+exec 6>&5
+echo five >&6
+
+ls -l /proc/$$/fd
+
+# Close fds as output file descriptors.
+exec 4>&- +exec 5>&- +echo six >&6 +exec 6>&- + +echo first file >&3 +exec 3>&- + +ls -l /proc/$$/fd + +exit 0 +""" + + +class DFuseFdTest(DfuseTestBase): + """Base FdTest test class. + + :avocado: recursive + """ + + def run_bashfd(self, il_lib=None): + """Run a shell script which opens and writes to files. + + This attempts to replicate the way that configure scripts manipulate fds in bash. + + Args: + il_lib (str, optional): interception library to run with. Defaults to None + """ + + if il_lib is not None: + lib_path = os.path.join(self.prefix, "lib64", il_lib) + env_str = f"export LD_PRELOAD={lib_path}; " + else: + env_str = "" + + pool = self.get_pool(connect=False) + container = self.get_container(pool) + self.start_dfuse(self.hostlist_clients, pool, container) + + fuse_root_dir = self.dfuse.mount_dir.value + + with open(os.path.join(fuse_root_dir, "bash_fd_inner.sh"), "w") as fd: + fd.write(INNER) + + os.chmod(os.path.join(fuse_root_dir, "bash_fd_inner.sh"), stat.S_IXUSR | stat.S_IRUSR) + + with open(os.path.join(fuse_root_dir, "bash_fd_outer.sh"), "w") as fd: + fd.write(OUTER) + + os.chmod(os.path.join(fuse_root_dir, "bash_fd_outer.sh"), stat.S_IXUSR | stat.S_IRUSR) + + cmd = f"cd {fuse_root_dir}; ./bash_fd_outer.sh" + + result = run_remote(self.log, self.hostlist_clients, env_str + cmd) + if not result.passed: + self.fail(f'"{cmd}" failed on {result.failed_hosts}') + + def test_bashfd(self): + """ + + Test Description: + Test a typical I/O pattern for bash based configure scripts. + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=dfuse,dfs + :avocado: tags=DFuseFdTest,test_bashfd + """ + self.run_bashfd() + + def test_bashfd_ioil(self): + """ + + Test Description: + Test a typical I/O pattern for bash based configure scripts. + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=dfuse,il,dfs + :avocado: tags=DFuseFdTest,test_bashfd_ioil + """ + self.run_bashfd(il_lib="libioil.so") + + def test_bashfd_pil4dfs(self): + """ + + Test Description: + Test a typical I/O pattern for bash based configure scripts. + + :avocado: tags=all,full_regression + :avocado: tags=vm + :avocado: tags=pil4dfs,dfs + :avocado: tags=DFuseFdTest,test_bashfd_pil4dfs + """ + self.run_bashfd(il_lib="libpil4dfs.so") diff --git a/src/tests/ftest/dfuse/bash_fd.yaml b/src/tests/ftest/dfuse/bash_fd.yaml new file mode 100644 index 000000000000..cba38655bc18 --- /dev/null +++ b/src/tests/ftest/dfuse/bash_fd.yaml @@ -0,0 +1,20 @@ +hosts: + test_servers: 1 +timeout: 900 +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos + system_ram_reserved: 1 +pool: + scm_size: 1G +container: + type: POSIX + control_method: daos diff --git a/src/tests/ftest/dfuse/daos_build.py b/src/tests/ftest/dfuse/daos_build.py index 79289adc833c..04c368f64aeb 100644 --- a/src/tests/ftest/dfuse/daos_build.py +++ b/src/tests/ftest/dfuse/daos_build.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -26,7 +26,7 @@ def test_dfuse_daos_build_wb(self): Mount dfuse Checkout and build DAOS sources. - :avocado: tags=all,daily_regression + :avocado: tags=all,pr,daily_regression :avocado: tags=hw,medium :avocado: tags=daosio,dfuse,daos_cmd :avocado: tags=DaosBuild,test_dfuse_daos_build_wb @@ -58,9 +58,9 @@ def test_dfuse_daos_build_wt_il(self): Mount dfuse Checkout and build DAOS sources. 
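For readers unfamiliar with the bash redirection forms exercised by bash_fd.py above: `exec 6>&5` duplicates a descriptor and `exec 5>&-` closes one, which is exactly the dup()/close() sequence dfuse ultimately sees from the kernel. A rough C equivalent of the other_file portion of the INNER script, for illustration only:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* C analogue of the bash redirections in the INNER script above:
 * "exec 5>other_file" is open(), "exec 6>&5" is dup(), "exec 5>&-" is
 * close(). Both fds share one open file description, so the file offset
 * advances across writes through either of them. */
int main(void)
{
	int fd5 = open("other_file", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd5 < 0)
		return 1;

	dprintf(fd5, "second file\n");

	int fd6 = dup(fd5);       /* exec 6>&5 */
	dprintf(fd6, "five\n");

	close(fd5);               /* exec 5>&- ; fd6 stays usable */
	dprintf(fd6, "six\n");
	close(fd6);
	return 0;
}
```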
- :avocado: tags=all,daily_regression + :avocado: tags=all,full_regression :avocado: tags=vm - :avocado: tags=daosio,dfuse + :avocado: tags=daosio,dfuse,il,dfs :avocado: tags=DaosBuild,test_dfuse_daos_build_wt_il """ self.run_build_test("writethrough", True, run_on_vms=True) @@ -106,7 +106,7 @@ def test_dfuse_daos_build_nocache(self): Mount dfuse Checkout and build DAOS sources. - :avocado: tags=all,daily_regression + :avocado: tags=all,full_regression :avocado: tags=hw,medium :avocado: tags=daosio,dfuse :avocado: tags=DaosBuild,test_dfuse_daos_build_nocache @@ -148,27 +148,27 @@ def run_build_test(self, cache_mode, intercept=False, run_on_vms=False): self.load_dfuse(self.hostlist_clients, dfuse_namespace) if cache_mode == 'writeback': - cont_attrs['dfuse-data-cache'] = '1d' + cont_attrs['dfuse-data-cache'] = '1m' cont_attrs['dfuse-attr-time'] = cache_time cont_attrs['dfuse-dentry-time'] = cache_time cont_attrs['dfuse-ndentry-time'] = cache_time elif cache_mode == 'writethrough': if intercept: build_time *= 6 - cont_attrs['dfuse-data-cache'] = '1d' + cont_attrs['dfuse-data-cache'] = '1m' cont_attrs['dfuse-attr-time'] = cache_time cont_attrs['dfuse-dentry-time'] = cache_time cont_attrs['dfuse-ndentry-time'] = cache_time self.dfuse.disable_wb_cache.value = True elif cache_mode == 'metadata': - cont_attrs['dfuse-data-cache'] = '1d' + cont_attrs['dfuse-data-cache'] = '1m' cont_attrs['dfuse-attr-time'] = cache_time cont_attrs['dfuse-dentry-time'] = cache_time cont_attrs['dfuse-ndentry-time'] = cache_time self.dfuse.disable_wb_cache.value = True elif cache_mode == 'data': build_time *= 2 - cont_attrs['dfuse-data-cache'] = '1d' + cont_attrs['dfuse-data-cache'] = '1m' cont_attrs['dfuse-attr-time'] = '0' cont_attrs['dfuse-dentry-time'] = '0' cont_attrs['dfuse-ndentry-time'] = '0' diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml index dbd63f69bbe5..78f132474b5b 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_failure.yaml @@ -25,8 +25,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/offline_rebuild.yaml b/src/tests/ftest/erasurecode/offline_rebuild.yaml index ebd4904ce2ca..7d1a6e7aa08c 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild.yaml @@ -32,8 +32,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/offline_rebuild_aggregation.py b/src/tests/ftest/erasurecode/offline_rebuild_aggregation.py index 5dc238f7a2e4..758acd6359d7 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild_aggregation.py +++ b/src/tests/ftest/erasurecode/offline_rebuild_aggregation.py @@ -64,29 +64,6 @@ def execution(self, agg_trigger=False): # data set self.ior_read_dataset(parity=2) - def test_ec_offline_rebuild_agg_disabled(self): - """Jira ID: DAOS-7313. - - Test Description: Test Erasure code object aggregation disabled mode - with IOR. - Use Case: Create the pool, disabled aggregation, run IOR with supported - EC object type with partial strip. - Verify that Aggregation should not triggered. - Verify the IOR read data at the end. - Kill single server and wait for rebuild. - Read and verify all the data. - Kill second server and wait for rebuild. - Read and verify data with +2 Parity with no data corruption. 
- - :avocado: tags=all,full_regression - :avocado: tags=hw,large - :avocado: tags=ec,aggregation,ec_array,ec_aggregation,rebuild - :avocado: tags=EcodAggregationOffRebuild,test_ec_offline_rebuild_agg_disabled - """ - # Disable the aggregation - self.pool.set_property("reclaim", "disabled") - self.execution() - def test_ec_offline_rebuild_agg_default(self): """Jira ID: DAOS-7313. diff --git a/src/tests/ftest/erasurecode/offline_rebuild_single.yaml b/src/tests/ftest/erasurecode/offline_rebuild_single.yaml index 5c32aaa29ac5..8b7fba9ae09e 100644 --- a/src/tests/ftest/erasurecode/offline_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/offline_rebuild_single.yaml @@ -30,8 +30,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg pool_query_timeout: 30 container: type: POSIX diff --git a/src/tests/ftest/erasurecode/online_rebuild.yaml b/src/tests/ftest/erasurecode/online_rebuild.yaml index fbfa74934474..f5ea4768df70 100644 --- a/src/tests/ftest/erasurecode/online_rebuild.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild.yaml @@ -31,8 +31,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/online_rebuild_single.yaml b/src/tests/ftest/erasurecode/online_rebuild_single.yaml index 8b0b3f4baf0f..7284a1f7134a 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_single.yaml @@ -30,8 +30,6 @@ server_config: storage: auto pool: size: 93% - svcn: 5 - control_method: dmg pool_query_timeout: 30 properties: rd_fac:2 container: diff --git a/src/tests/ftest/erasurecode/rebuild_disabled.yaml b/src/tests/ftest/erasurecode/rebuild_disabled.yaml index 55939c0b4258..e970e0439729 100644 --- a/src/tests/ftest/erasurecode/rebuild_disabled.yaml +++ b/src/tests/ftest/erasurecode/rebuild_disabled.yaml @@ -32,8 +32,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg pool_query_timeout: 30 container: type: POSIX diff --git a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml index 8a2294753bcf..b4154fcc3692 100644 --- a/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml +++ b/src/tests/ftest/erasurecode/rebuild_disabled_single.yaml @@ -32,8 +32,6 @@ server_config: storage: auto pool: size: 93% - svcn: 1 - control_method: dmg pool_query_timeout: 30 container: type: POSIX diff --git a/src/tests/ftest/ior/crash.py b/src/tests/ftest/ior/crash.py index a41dc920f62d..fe4c8f76ad6c 100644 --- a/src/tests/ftest/ior/crash.py +++ b/src/tests/ftest/ior/crash.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -7,6 +7,7 @@ import time from dmg_utils import check_system_query_status +from general_utils import wait_for_result from ior_test_base import IorTestBase @@ -15,20 +16,18 @@ class IorCrash(IorTestBase): Verify DAOS server does not need to be restarted when an application crashes. :avocado: recursive """ + def verify_cont_handles(self, expected_handles=1): + """Verify number of container handles. If needed, perform multiple queries (with delay). - def setUp(self): - """Set up test before executing.""" - super().setUp() - self.dmg = self.get_dmg_command() + Args: + expected_handles (int): expected number of container handles. Defaults to 1. 
- def cont_nhandles_match(self, exp_nhandles=1, attempts=5, delay_sec=2): - """Verify container number of handles. If needed, perform multiple queries (with delay).""" - for _ in range(attempts): - if self.container.check_container_info(ci_nhandles=exp_nhandles): - return True - self.log.info("check_container_info does not match yet, sleep %d sec", delay_sec) - time.sleep(delay_sec) - return False + Returns: + bool: whether expected matches actual + """ + return wait_for_result( + self.log, self.container.verify_query, timeout=10, delay=2, + expected_response={'num_handles': expected_handles}) def test_ior_crash(self): """Jira ID: DAOS-4332. @@ -52,6 +51,8 @@ def test_ior_crash(self): :avocado: tags=daosio,ior,dfs :avocado: tags=IorCrash,test_ior_crash """ + dmg = self.get_dmg_command() + # Create pool and container self.pool = self.get_pool(connect=False) self.container = self.get_container(self.pool) @@ -66,13 +67,13 @@ def test_ior_crash(self): self.stop_ior() # Verify engines did not crash - scan_info = self.dmg.system_query(verbose=True) + scan_info = dmg.system_query(verbose=True) if not check_system_query_status(scan_info): self.fail("One or more engines crashed") # Verify container handle opened by ior is closed (by daos_agent after ior crash). # Expect to find one open handle now (a handle opened for this check) - self.assertTrue(self.cont_nhandles_match(), "Error confirming container info nhandles") + self.assertTrue(self.verify_cont_handles(), "Error confirming container info nhandles") # Run IOR and crash it in the middle of Read. # Must wait for Write to complete first. @@ -82,23 +83,23 @@ def test_ior_crash(self): self.stop_ior() # Verify engines did not crash - scan_info = self.dmg.system_query(verbose=True) + scan_info = dmg.system_query(verbose=True) if not check_system_query_status(scan_info): self.fail("One or more engines crashed") # Verify container handle opened by ior is closed (by daos_agent after ior crash). - self.assertTrue(self.cont_nhandles_match(), "Error confirming container info nhandles") + self.assertTrue(self.verify_cont_handles(), "Error confirming container info nhandles") # Run IOR and verify it completes successfully self.run_ior_with_pool(create_pool=False, create_cont=False) # Verify engines did not crash - scan_info = self.dmg.system_query(verbose=True) + scan_info = dmg.system_query(verbose=True) if not check_system_query_status(scan_info): self.fail("One or more engines crashed") # Verify container handle opened by ior is closed (by ior before its graceful exit) # Give ior some time to get started and open the container! 
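        # Illustrative sketch (assuming the general_utils.wait_for_result signature
        # wait_for_result(log, get_method, timeout, delay, **kwargs)): the inline
        # equivalent of the handle check below would be roughly
        #     wait_for_result(self.log, self.container.verify_query, timeout=10, delay=2,
        #                     expected_response={'num_handles': 2})
        # i.e. re-query the container every 2 seconds for up to 10 seconds.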
# And, expect 2 open handles, one for this container open/query, and another for ior itself - self.assertTrue(self.cont_nhandles_match(exp_nhandles=2, attempts=5, delay_sec=2), + self.assertTrue(self.verify_cont_handles(expected_handles=2), "Error confirming container info nhandles") diff --git a/src/tests/ftest/ior/hard_rebuild.yaml b/src/tests/ftest/ior/hard_rebuild.yaml index a3a5f5f5444c..97137c97b462 100644 --- a/src/tests/ftest/ior/hard_rebuild.yaml +++ b/src/tests/ftest/ior/hard_rebuild.yaml @@ -31,34 +31,28 @@ server_config: log_file: daos_server1.log log_mask: ERR storage: auto -create_pool_max_size: - scm: true - percentage: 90 pool: - control_method: dmg + size: 90% container: type: POSIX control_method: daos - properties: dedup:memcmp ior: api: "DFS" client_processes: np: 32 - dfs_destroy: false iorflags: flags: "-C -k -e -w -g -G 27 -D 120 -Q 1 -vv" read_flags: "-C -k -e -r -R -g -G 27 -D 120 -Q 1 -vv" test_file: daos:testFile segment_count: 2000000 - repetitions: 1 chunk_block_transfer_sizes: # [ChunkSize, BlocksSize, TransferSize] - [47008, 47008, 47008] objectclass: dfs_oclass_list: # - [EC_Object_Class, Minimum number of servers] - - ["EC_2P2G1", 6] - - ["EC_4P2G1", 8] - - ["EC_8P2G1", 12] + - ["EC_2P2GX", 6] + - ["EC_4P2GX", 8] + - ["EC_8P2GX", 12] sw_wearout: 1 sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile" diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 137f9655f476..2f168891d37f 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -8,8 +8,9 @@ import json import logging import os +import re import sys -from argparse import ArgumentParser, RawDescriptionHelpFormatter +from argparse import ArgumentParser, ArgumentTypeError, RawDescriptionHelpFormatter from collections import OrderedDict from tempfile import TemporaryDirectory @@ -259,32 +260,15 @@ def _run(self, args): for key in sorted(args.__dict__.keys()): logger.debug(" %s = %s", key, getattr(args, key)) - # Convert host specifications into NodeSets - try: - test_servers = NodeSet(args.test_servers) - except TypeError: - message = f"Invalid '--test_servers={args.test_servers}' argument" - return self.get_exit_status(1, message, "Setup", sys.exc_info()) - try: - test_clients = NodeSet(args.test_clients) - except TypeError: - message = f"Invalid '--test_clients={args.test_clients}' argument" - return self.get_exit_status(1, message, "Setup", sys.exc_info()) - try: - control_host = NodeSet(args.slurm_control_node) - except TypeError: - message = f"Invalid '--slurm_control_node={args.slurm_control_node}' argument" - return self.get_exit_status(1, message, "Setup", sys.exc_info()) - # A list of server hosts is required - if not test_servers and not args.list: + if not args.test_servers and not args.list: return self.get_exit_status(1, "Missing required '--test_servers' argument", "Setup") - logger.info("Testing with hosts: %s", test_servers.union(test_clients)) - self.details["test hosts"] = str(test_servers.union(test_clients)) + logger.info("Testing with hosts: %s", args.test_servers.union(args.test_clients)) + self.details["test hosts"] = str(args.test_servers.union(args.test_clients)) # Add the installed packages to the details json # pylint: disable=unsupported-binary-operation - all_hosts = test_servers | test_clients | self.local_host + all_hosts = args.test_servers | args.test_clients | self.local_host self.details["installed packages"] = find_packages( logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'") @@ -295,8 +279,8 @@ def 
_run(self, args): set_test_environment(logger) else: set_test_environment( - logger, test_env, test_servers, test_clients, args.provider, args.insecure_mode, - self.details) + logger, test_env, args.test_servers, args.test_clients, args.provider, + args.insecure_mode, self.details) except TestEnvironmentException as error: message = f"Error setting up test environment: {str(error)}" return self.get_exit_status(1, message, "Setup", sys.exc_info()) @@ -316,8 +300,8 @@ def _run(self, args): # Define the test configs specified by the arguments group = TestGroup( - self.avocado, test_env, test_servers, test_clients, control_host, args.tags, args.nvme, - yaml_dir, args.yaml_extension) + self.avocado, test_env, args.test_servers, args.test_clients, args.slurm_control_node, + args.tags, args.nvme, yaml_dir, args.yaml_extension) try: group.list_tests(logger, args.verbose) except RunException: @@ -335,7 +319,7 @@ def _run(self, args): # Setup the fuse configuration try: - setup_fuse_config(logger, test_servers | test_clients) + setup_fuse_config(logger, args.test_servers | args.test_clients) except LaunchException: # Warn but don't fail message = "Issue detected setting up the fuse configuration" @@ -345,7 +329,7 @@ def _run(self, args): core_files = {} if args.process_cores: try: - all_hosts = test_servers | test_clients | self.local_host + all_hosts = args.test_servers | args.test_clients | self.local_host core_files = get_core_file_pattern(logger, all_hosts) except LaunchException: message = "Error obtaining the core file pattern information" @@ -356,7 +340,7 @@ def _run(self, args): # Determine if bullseye code coverage collection is enabled code_coverage = CodeCoverage(test_env) # pylint: disable=unsupported-binary-operation - code_coverage.check(logger, test_servers | self.local_host) + code_coverage.check(logger, args.test_servers | self.local_host) # Update the test yaml files for the tests in this test group try: @@ -402,6 +386,58 @@ def _run(self, args): return self.get_exit_status(status, "Executing tests complete") +def __arg_type_file(val): + """Parse a file argument. + + Args: + val (str): path to a file + + Returns: + str: the file path + + Raises: + ArgumentTypeError: if val is not a file + """ + if not os.path.isfile(val): + raise ArgumentTypeError(f'File not found: {val}') + return val + + +def __arg_type_nodeset(val): + """Parse a NodeSet argument. + + Args: + val (str): string representation of a NodeSet to parse + + Returns: + NodeSet: the NodeSet + + Raises: + ArgumentTypeError: if val cannot be parsed as a NodeSet + """ + try: + return NodeSet(val) + except Exception as err: # pylint: disable=broad-except + raise ArgumentTypeError(f'Invalid NodeSet: {val}') from err + + +def __arg_type_find_size(val): + """Parse a find -size argument. + + Args: + val (str): string representation of find -size argument + + Returns: + str: the find -size argument + + Raises: + ArgumentTypeError: if val cannot be parsed as a find -size argument + """ + if not re.match(r'^[0-9]+[bcwkMG]?$', val): + raise ArgumentTypeError(f'Invalid find -size argument: {val}') + return val + + def main(): """Launch DAOS functional tests.""" # Parse the command line arguments @@ -479,7 +515,7 @@ def main(): "-e", "--extra_yaml", action="append", default=None, - type=str, + type=__arg_type_file, help="additional yaml file to include with the test yaml file. 
Any " "entries in the extra yaml file can be used to replace an " "existing entry in the test yaml file.") @@ -566,8 +602,8 @@ def main(): parser.add_argument( "-sc", "--slurm_control_node", action="store", - default=str(get_local_host()), - type=str, + type=__arg_type_nodeset, + default=get_local_host(), help="slurm control node where scontrol commands will be issued to check for the existence " "of any slurm partitions required by the tests") parser.add_argument( @@ -602,14 +638,17 @@ def main(): parser.add_argument( "-tc", "--test_clients", action="store", + type=__arg_type_nodeset, + default=NodeSet(), help="comma-separated list of hosts to use as replacement values for " "client placeholders in each test's yaml file") parser.add_argument( "-th", "--logs_threshold", action="store", + type=__arg_type_find_size, help="collect log sizes and report log sizes that go past provided" "threshold. e.g. '-th 5M'" - "Valid threshold units are: B, K, M, G, T") + "Valid threshold units are: b, c, w, k, M, G for find -size") parser.add_argument( "-tm", "--timeout_multiplier", action="store", @@ -619,6 +658,8 @@ def main(): parser.add_argument( "-ts", "--test_servers", action="store", + type=__arg_type_nodeset, + default=NodeSet(), help="comma-separated list of hosts to use as replacement values for " "server placeholders in each test's yaml file. If the " "'--test_clients' argument is not specified, this list of hosts " diff --git a/src/tests/ftest/mpiio/llnl_mpi4py.py b/src/tests/ftest/mpiio/llnl_mpi4py.py index 1800d70e71fe..627fe6c637c8 100644 --- a/src/tests/ftest/mpiio/llnl_mpi4py.py +++ b/src/tests/ftest/mpiio/llnl_mpi4py.py @@ -7,6 +7,7 @@ import os import site +from apricot import skipForTicket from mpiio_test_base import MpiioTests @@ -55,6 +56,7 @@ def test_llnl(self): test_repo = self.params.get("llnl", '/run/test_repo/') self.run_test(test_repo, "llnl") + @SkipForTicket("DAOS-14369") def test_mpi4py(self): """Jira ID: DAOS-2231 diff --git a/src/tests/ftest/nvme/fault.py b/src/tests/ftest/nvme/fault.py index 5cb30c3815ed..8abf2e56b6a4 100644 --- a/src/tests/ftest/nvme/fault.py +++ b/src/tests/ftest/nvme/fault.py @@ -37,7 +37,7 @@ def test_nvme_fault(self): :avocado: tags=nvme_fault,test_nvme_fault """ # Create the Pool with Maximum NVMe size - self.create_pool_max_size(nvme=True) + self.add_pool() # Start the IOR Command and generate the NVMe fault. 
self.start_ior_load(operation="Auto_Write", percent=self.capacity) diff --git a/src/tests/ftest/nvme/fault.yaml b/src/tests/ftest/nvme/fault.yaml index ba862afeec2e..ea7021e2d68d 100644 --- a/src/tests/ftest/nvme/fault.yaml +++ b/src/tests/ftest/nvme/fault.yaml @@ -31,7 +31,7 @@ dmg: transport_config: allow_insecure: True pool: - scm_size: 50GB + size: 96% control_method: dmg container: type: POSIX diff --git a/src/tests/ftest/performance/ior_easy.py b/src/tests/ftest/performance/ior_easy.py index f95d83c34f83..2be1f8983417 100644 --- a/src/tests/ftest/performance/ior_easy.py +++ b/src/tests/ftest/performance/ior_easy.py @@ -57,54 +57,6 @@ def test_performance_ior_easy_dfuse_ec_16p2gx(self): """ self.run_performance_ior(namespace="/run/ior_dfuse_ec_16p2gx/*") - def test_performance_ior_easy_dfs_ec_4p2gx_stop_write(self): - """Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during write - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs - :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_write - """ - self.run_performance_ior( - namespace="/run/ior_dfs_ec_4p2gx/*", - stop_delay_write=0.5) - - def test_performance_ior_easy_dfs_ec_4p2gx_stop_read(self): - """Test Description: Run IOR Easy, DFS, EC_4P2GX, stop a rank during read. - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs - :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_4p2gx_stop_read - """ - self.run_performance_ior( - namespace="/run/ior_dfs_ec_4p2gx/*", - stop_delay_read=0.5) - - def test_performance_ior_easy_dfs_ec_16p2gx_stop_write(self): - """Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during write. - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs - :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_write - """ - self.run_performance_ior( - namespace="/run/ior_dfs_ec_16p2gx/*", - stop_delay_write=0.5) - - def test_performance_ior_easy_dfs_ec_16p2gx_stop_read(self): - """Test Description: Run IOR Easy, DFS, EC_16P2GX, stop a rank during read. - - :avocado: tags=all,manual - :avocado: tags=hw,medium - :avocado: tags=performance,performance_ior,performance_ior_easy,performance_dfs - :avocado: tags=IorEasy,test_performance_ior_easy_dfs_ec_16p2gx_stop_read - """ - self.run_performance_ior( - namespace="/run/ior_dfs_ec_16p2gx/*", - stop_delay_read=0.5) - def test_performance_ior_easy_hdf5_sx(self): """Test Description: Run IOR Easy, HDF5, SX. diff --git a/src/tests/ftest/performance/mdtest_easy.py b/src/tests/ftest/performance/mdtest_easy.py index 99ae476f02bd..ba08938553c0 100644 --- a/src/tests/ftest/performance/mdtest_easy.py +++ b/src/tests/ftest/performance/mdtest_easy.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2019-2022 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,27 +43,3 @@ def test_performance_mdtest_easy_dfuse_s1(self): :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfuse_s1,dfuse """ self.run_performance_mdtest(namespace="/run/mdtest_dfuse_s1/*") - - def test_performance_mdtest_easy_dfs_ec_4p2g1_stop(self): - """Test Description: Run MDTest Easy, DFS, EC_4P2G1, stop a rank. 
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs
-        :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_4p2g1_stop
-        """
-        self.run_performance_mdtest(
-            namespace="/run/mdtest_dfs_ec_4p2g1/*",
-            stop_delay=0.5)
-
-    def test_performance_mdtest_easy_dfs_ec_16p2g1_stop(self):
-        """Test Description: Run MDTest Easy, DFS, EC_16P2G1, stop a rank.
-
-        :avocado: tags=all,manual
-        :avocado: tags=hw,medium
-        :avocado: tags=performance,performance_mdtest,performance_mdtest_easy,performance_dfs
-        :avocado: tags=MdtestEasy,test_performance_mdtest_easy_dfs_ec_16p2g1_stop
-        """
-        self.run_performance_mdtest(
-            namespace="/run/mdtest_dfs_ec_16p2g1/*",
-            stop_delay=0.5)
diff --git a/src/tests/ftest/pool/create_capacity.py b/src/tests/ftest/pool/create_capacity.py
index 451be26f77f6..f8e009456e7b 100644
--- a/src/tests/ftest/pool/create_capacity.py
+++ b/src/tests/ftest/pool/create_capacity.py
@@ -1,17 +1,15 @@
 """
-(C) Copyright 2021-2023 Intel Corporation.
+(C) Copyright 2021-2024 Intel Corporation.
 
 SPDX-License-Identifier: BSD-2-Clause-Patent
 """
 import time
 
 from apricot import TestWithServers
-from server_utils import ServerFailed
 from test_utils_pool import add_pool, check_pool_creation
 
 
 class PoolCreateCapacityTests(TestWithServers):
-    # pylint: disable=too-few-public-methods
     """Pool create tests.
 
     All of the tests verify pool create performance with 7 servers and 1 client.
@@ -24,65 +22,60 @@ def setUp(self):
         """Set up each test case."""
         # Create test-case-specific DAOS log files
         self.update_log_file_names()
+
         super().setUp()
 
     def test_create_pool_quantity(self):
         """JIRA ID: DAOS-5114 / SRS-2 / SRS-4.
 
         Test Description:
-            Create 200 pools on all of the servers.
+            Create a given number of pools on all of the servers.
             Perform an orderly system shutdown via cmd line (dmg).
             Restart the system via cmd line tool (dmg).
             Verify that DAOS is ready to accept requests within 2 minutes.
+            Verify that all the created pools exist after the restart.
 
         :avocado: tags=all,daily_regression
         :avocado: tags=hw,medium
         :avocado: tags=pool
         :avocado: tags=PoolCreateCapacityTests,test_create_pool_quantity
         """
-        # Create some number of pools each using a equal amount of 60% of the
-        # available capacity, e.g. 0.6% for 100 pools.
-        quantity = self.params.get("quantity", "/run/pool/*", 1)
-        storage = self.server_managers[0].get_available_storage()
-        if storage['nvme'] < 750156374016:
-            self.log.info(
-                'Reducing pool quantity from %s -> 150 due to insufficient NVMe capacity (%s < '
-                '750156374016)', quantity, storage['nvme'])
-            quantity = 150
+        daos_restart_duration = self.params.get("restart_duration", "/run/server_config/*", 0)
+        pool_create_duration = self.params.get("create_duration", "/run/pool/*", 0)
+        pool_quantity = self.params.get("quantity", "/run/pool/*", None)
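        # Illustrative sketch: `quantity` is now a two-element list (see the
        # create_capacity.yaml hunk below, `quantity: [150, 200]`) where, per the usage
        # in this test, index 0 is the minimum acceptable pool count and index 1 the
        # number of pools to attempt; with hypothetical names:
        #     min_pools, target_pools = pool_quantity   # e.g. 150, 200
        # check_pool_creation() trims the pool list if DER_NOSPACE stops creation early
        # (see the test_utils_pool.py hunk later in this diff).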
 
         # Define all the pools with the same size defined in the test yaml
-        self.log_step('Defining {} pools'.format(quantity))
+        self.log_step(f"Defining {pool_quantity[1]} pools to create")
         pools = []
-        for _ in range(quantity):
+        for _ in range(pool_quantity[1]):
             pools.append(add_pool(self, create=False))
 
         # Create all the pools
-        self.log_step('Creating {} pools (dmg pool create)'.format(quantity))
+        self.log_step(f"Attempt to create {pool_quantity[1]} pools (dmg pool create)")
         self.get_dmg_command().server_set_logmasks("DEBUG", raise_exception=False)
-        check_pool_creation(self, pools, 30, 2)
+        pools = check_pool_creation(self, pools, pool_create_duration, minimum=pool_quantity[0])
         self.get_dmg_command().server_set_logmasks(raise_exception=False)
 
-        # Verify DAOS can be restarted in less than 2 minutes
+        # Shutdown DAOS file system
         self.log_step('Stopping all engines (dmg system stop)')
-        try:
-            self.server_managers[0].system_stop()
-        except ServerFailed as error:
-            self.fail(error)
+        self.server_managers[0].system_stop()
 
-        start = float(time.time())
+        # Restarting DAOS file system
         self.log_step('Starting all engines (dmg system start)')
-        try:
-            self.server_managers[0].system_start()
-        except ServerFailed as error:
-            self.fail(error)
-
+        start = float(time.time())
+        self.server_managers[0].system_start()
         duration = float(time.time()) - start
-        self.log_step('Verifying all engines started in 120 seconds: {}'.format(duration))
-        if duration > 120:
-            self.fail("DAOS not ready to accept requests within 2 minutes after restart")
+
+        # Verify that DAOS is ready to accept requests within a duration defined in the test yaml
+        self.log_step(
+            f"Verifying all engines started in {daos_restart_duration}s")
+        if duration > daos_restart_duration:
+            self.fail(
+                'DAOS file system is not ready to accept requests within '
+                f"{daos_restart_duration}s after restart: got={duration}s")
 
         # Verify all the pools exists after the restart
-        self.log_step('Verifying all {} pools exist after engine restart'.format(quantity))
+        self.log_step(f"Verifying all {len(pools)} pools exist after engines restart")
         self.get_dmg_command().timeout = 360
         pool_uuids = self.get_dmg_command().get_pool_list_uuids(no_query=True)
         detected_pools = [uuid.lower() for uuid in pool_uuids]
@@ -93,9 +86,11 @@ def test_create_pool_quantity(self):
                 missing_pools.append(pool_uuid)
         if missing_pools:
             self.fail(
-                'The following created pools were not detected in the pool '
-                'list after rebooting the servers:\n  [{}]: {}'.format(
-                    len(missing_pools), ", ".join(missing_pools)))
+                f"{len(missing_pools)} pools are missing after engines restart: "
+                f"miss=[{', '.join(missing_pools)}]")
         if len(pools) != len(detected_pools):
-            self.fail('Incorrect number of pools detected after rebooting the servers')
+            self.fail(
+                'Incorrect number of pools detected after engines restart: '
+                f"wait={len(pools)}, got={len(detected_pools)}")
+
+        self.log_step('Test passed')
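An illustrative sketch of how the rewritten test consumes the yaml values added below;
`params.get(name, path, default)` is the standard avocado helper, and the trailing
comments give the values defined in this yaml:

    restart_limit = self.params.get("restart_duration", "/run/server_config/*", 0)  # 120
    create_limit = self.params.get("create_duration", "/run/pool/*", 0)             # 30
    min_pools, target_pools = self.params.get("quantity", "/run/pool/*", None)      # 150, 200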
diff --git a/src/tests/ftest/pool/create_capacity.yaml b/src/tests/ftest/pool/create_capacity.yaml
index f7cc768611ba..9ba2c5652fe5 100644
--- a/src/tests/ftest/pool/create_capacity.yaml
+++ b/src/tests/ftest/pool/create_capacity.yaml
@@ -3,11 +3,13 @@ hosts:
   test_clients: 1
 
 timeouts:
-  test_create_pool_quantity: 1200
+  # NOTE DAOS-14528: Half of the time is needed to destroy the pools in the teardown
+  test_create_pool_quantity: 900
 
 server_config:
   name: daos_server
   engines_per_host: 2
+  restart_duration: 120
   engines:
     0:
       pinned_numa_node: 0
@@ -18,6 +20,8 @@ server_config:
       log_mask: DEBUG
       targets: 1
       env_vars:
+        # NOTE The minimal size of the DAOS RDB is equal to 128MiB
+        - DAOS_MD_CAP=128
         - DD_MASK=group_metadata_only
         - D_LOG_FLUSH=DEBUG
       storage: auto
@@ -30,6 +34,8 @@ server_config:
       log_mask: DEBUG
       targets: 1
       env_vars:
+        # NOTE The minimal size of the DAOS RDB is equal to 128MiB
+        - DAOS_MD_CAP=128
        - DD_MASK=group_metadata_only
         - D_LOG_FLUSH=DEBUG
       storage: auto
@@ -37,6 +43,8 @@ server_config:
 pool:
   control_method: dmg
   scm_size: 192MiB
+  # NOTE The minimal size of DATA with MD on SCM is equal to 1GiB (i.e. size of a SPDK cluster)
   nvme_size: 1024MiB
-  quantity: 200
+  quantity: [150, 200]
+  create_duration: 30
 set_logmasks: False
diff --git a/src/tests/ftest/rebuild/cascading_failures.yaml b/src/tests/ftest/rebuild/cascading_failures.yaml
index 8b9b272b4bd0..c5fbb7e28cd2 100644
--- a/src/tests/ftest/rebuild/cascading_failures.yaml
+++ b/src/tests/ftest/rebuild/cascading_failures.yaml
@@ -15,9 +15,7 @@ server_config:
       scm_mount: /mnt/daos
       system_ram_reserved: 1
 pool:
-  scm_size: 1073741824
-  svcn: 2
-  control_method: dmg
+  size: 1G
   pool_query_timeout: 30
   properties: rd_fac:2
 container:
diff --git a/src/tests/ftest/rebuild/container_rf.py b/src/tests/ftest/rebuild/container_rf.py
index 51037212556c..c0daa01f6be3 100644
--- a/src/tests/ftest/rebuild/container_rf.py
+++ b/src/tests/ftest/rebuild/container_rf.py
@@ -39,5 +39,4 @@ def test_rebuild_with_container_rf(self):
         :avocado: tags=container,rebuild,container_rf
         :avocado: tags=RbldContRfTest,test_rebuild_with_container_rf
         """
-        self.mode = "cont_rf_with_rebuild"
-        self.execute_cont_rf_test()
+        self.execute_cont_rf_test(mode="cont_rf_with_rebuild")
diff --git a/src/tests/ftest/rebuild/container_rf.yaml b/src/tests/ftest/rebuild/container_rf.yaml
index eb436baa2d37..f1333dbbadc4 100644
--- a/src/tests/ftest/rebuild/container_rf.yaml
+++ b/src/tests/ftest/rebuild/container_rf.yaml
@@ -1,5 +1,5 @@
 hosts:
-  test_servers: 6
+  test_servers: 7
   test_clients: 1
 timeout: 480
 server_config:
@@ -15,9 +15,8 @@ server_config:
       scm_mount: /mnt/daos
       system_ram_reserved: 1
 pool:
-  scm_size: 1073741824
-  svcn: 3
-  control_method: dmg
+  size: 1G
+  svcn: 7  # To match number of servers
   pool_query_timeout: 30
 container:
   control_method: daos
diff --git a/src/tests/ftest/rebuild/delete_objects.yaml b/src/tests/ftest/rebuild/delete_objects.yaml
index bb99fe48d70a..e80a2aec9c4d 100644
--- a/src/tests/ftest/rebuild/delete_objects.yaml
+++ b/src/tests/ftest/rebuild/delete_objects.yaml
@@ -16,9 +16,7 @@ server_config:
       system_ram_reserved: 1
 pool:
   scm_size: 1073741824
-  svcn: 2
   debug: true
-  control_method: dmg
   pool_query_timeout: 30
   properties: rd_fac:2
 container:
diff --git a/src/tests/ftest/rebuild/read_array.yaml b/src/tests/ftest/rebuild/read_array.yaml
index 6bb9e559a17a..b7a64dfc3e28 100644
--- a/src/tests/ftest/rebuild/read_array.yaml
+++ b/src/tests/ftest/rebuild/read_array.yaml
@@ -19,9 +19,7 @@ server_config:
       scm_mount: /mnt/daos
       system_ram_reserved: 1
 pool:
-  scm_size: 1073741824
-  svcn: 2
-  control_method: dmg
+  size: 1G
   pool_query_timeout: 30
   properties: rd_fac:2
 container:
diff --git a/src/tests/ftest/rebuild/widely_striped.py b/src/tests/ftest/rebuild/widely_striped.py
index 470926df05dd..e5d88e6d2205 100644
--- a/src/tests/ftest/rebuild/widely_striped.py
+++ b/src/tests/ftest/rebuild/widely_striped.py
@@ -42,7 +42,7 @@ def test_rebuild_widely_striped(self):
 
         :avocado: tags=all,full_regression
         :avocado: tags=hw,large
-        :avocado: tags=rebuild
+        :avocado: tags=rebuild,mdtest
         :avocado: tags=RbldWidelyStriped,test_rebuild_widely_striped
         """
         # set params
diff --git a/src/tests/ftest/rebuild/widely_striped.yaml b/src/tests/ftest/rebuild/widely_striped.yaml
index fdeb6090d11c..40853a14b20f 100644
--- a/src/tests/ftest/rebuild/widely_striped.yaml
+++ b/src/tests/ftest/rebuild/widely_striped.yaml
@@ -18,15 +18,14 @@ testparams:
 pool:
   scm_size: 10G
   nvme_size: 60G
-  svcn: 5
-  control_method: dmg
   rebuild_timeout: 240
   pool_query_timeout: 60
+  properties: rd_fac:2
 container:
   type: POSIX
   control_method: daos
   oclass: RP_3G1
-  properties: "rd_fac:2"
+  properties: rd_fac:2
 mdtest:
   api: DFS
   client_processes:
diff --git a/src/tests/ftest/rebuild/with_ior.yaml b/src/tests/ftest/rebuild/with_ior.yaml
index d58b65f9ea0a..44e02c613972 100644
--- a/src/tests/ftest/rebuild/with_ior.yaml
+++ b/src/tests/ftest/rebuild/with_ior.yaml
@@ -29,8 +29,7 @@ server_config:
 
 pool:
   scm_size: 6G
-  svcn: 3
-  control_method: dmg
+  properties: rd_fac:1
   pool_query_timeout: 30
 container:
diff --git a/src/tests/ftest/scrubber/check_csum_metrics_mdtest.py b/src/tests/ftest/scrubber/check_csum_metrics_mdtest.py
index 3c1e9befab2c..b94e3ab092a2 100644
--- a/src/tests/ftest/scrubber/check_csum_metrics_mdtest.py
+++ b/src/tests/ftest/scrubber/check_csum_metrics_mdtest.py
@@ -25,7 +25,7 @@ def test_scrubber_csum_metrics_with_mdtest(self):
         and compare it with initial values.
:avocado: tags=all,full_regression :avocado: tags=hw,medium - :avocado: tags=scrubber + :avocado: tags=scrubber,mdtest :avocado: tags=CheckCsumMetricsMdtest,test_scrubber_csum_metrics_with_mdtest """ diff --git a/src/tests/ftest/server/metadata.py b/src/tests/ftest/server/metadata.py index 18cf9edf70fc..30c96a901f93 100644 --- a/src/tests/ftest/server/metadata.py +++ b/src/tests/ftest/server/metadata.py @@ -65,7 +65,7 @@ class ObjectMetadata(TestWithServers): CREATED_CONTAINERS_MIN = 2900 # Number of created containers that should not be possible - CREATED_CONTAINERS_LIMIT = 3500 + CREATED_CONTAINERS_LIMIT = 7500 def __init__(self, *args, **kwargs): """Initialize a TestWithServers object.""" diff --git a/src/tests/ftest/server/metadata.yaml b/src/tests/ftest/server/metadata.yaml index 5eb8dea5330c..a0e71cce8d7a 100644 --- a/src/tests/ftest/server/metadata.yaml +++ b/src/tests/ftest/server/metadata.yaml @@ -49,7 +49,6 @@ server_config: pool: svcn: 5 scm_size: 1G - nvme_size: 8G control_method: dmg container: control_method: API diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 682bca42bbe3..11f5c67dd952 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -150,7 +150,6 @@ def __init__(self, *args, **kwargs): def setUp(self): """Set up each test case.""" # get paths from the build_vars generated by build - try: with open('../../.build_vars.json') as build_vars: build_paths = json.load(build_vars) diff --git a/src/tests/ftest/util/container_rf_test_base.py b/src/tests/ftest/util/container_rf_test_base.py index cf89ea6355c5..99e49c1532c3 100644 --- a/src/tests/ftest/util/container_rf_test_base.py +++ b/src/tests/ftest/util/container_rf_test_base.py @@ -1,11 +1,10 @@ """ - (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ import re -from daos_utils import DaosCommand from general_utils import DaosTestError from rebuild_test_base import RebuildTestBase @@ -15,21 +14,6 @@ class ContRedundancyFactor(RebuildTestBase): :avocado: recursive """ - - def __init__(self, *args, **kwargs): - """Initialize a CascadingFailures object.""" - super().__init__(*args, **kwargs) - self.mode = None - self.daos_cmd = None - - def create_test_container(self): - """Create a container and write objects.""" - self.log.info( - "==>(1)Create pool and container with redundant factor," - " start background IO object write") - self.container.create() - self.container.write_objects(self.inputs.rank.value[0], self.inputs.object_class.value) - def verify_rank_has_objects(self): """Verify the first rank to be excluded has at least one object.""" rank_list = self.container.get_target_rank_lists(" before rebuild") @@ -61,8 +45,7 @@ def verify_cont_rf_healthstatus(self, expected_rf, expected_health): actual_rf = None actual_health = None - cont_props = self.daos_cmd.container_get_prop( - pool=self.pool.uuid, cont=self.container.uuid, properties=["rd_fac", "status"]) + cont_props = self.container.get_prop(properties=["rd_fac", "status"]) for cont_prop in cont_props["response"]: if cont_prop["name"] == "rd_fac": actual_rf = cont_prop["value"] @@ -144,19 +127,20 @@ def create_test_container_and_write_obj(self, negative_test=False): self.fail("#Negative test, container redundancy factor " "test failed, return error RC: -1003 not found") - def execute_cont_rf_test(self, create_container=True): + def execute_cont_rf_test(self, create_container=True, mode=None): """Execute the rebuild test steps for container rd_fac test. Args: create_container (bool, optional): should the test create a container. Defaults to True. 
+ mode (str): either "cont_rf_with_rebuild" or "cont_rf_enforcement" """ # Get the test params and var self.setup_test_pool() - self.daos_cmd = DaosCommand(self.bin) if create_container: self.setup_test_container() oclass = self.inputs.object_class.value + # Negative testing pertains to RF enforcement when creating objects - not rebuild negative_test = True rd_fac = ''.join(self.container.properties.value.split(":")) rf_match = re.search(r"rd_fac([0-9]+)", rd_fac) @@ -171,7 +155,8 @@ def execute_cont_rf_test(self, create_container=True): self.create_test_pool() # Create a container and write objects self.create_test_container_and_write_obj(negative_test) - if self.mode == "cont_rf_with_rebuild": + + if mode == "cont_rf_with_rebuild": num_of_ranks = len(self.inputs.rank.value) if num_of_ranks > rf_num: expect_cont_status = "UNCLEAN" @@ -186,7 +171,7 @@ def execute_cont_rf_test(self, create_container=True): # Refresh local pool and container self.log.info("==>(6)Check for pool and container info after rebuild.") self.pool.check_pool_info() - self.container.check_container_info() + self.container.query() # Verify the excluded rank is no longer used with the objects self.verify_rank_has_no_objects() # Verify the pool information after rebuild @@ -196,7 +181,7 @@ def execute_cont_rf_test(self, create_container=True): self.log.info("==>(7)Check for container data if the container is healthy.") self.verify_container_data() self.log.info("Test passed") - elif self.mode == "cont_rf_enforcement": + elif mode == "cont_rf_enforcement": self.log.info("Container rd_fac test passed") else: - self.fail("#Unsupported container_rf test mode") + self.fail(f"Unsupported container_rf test mode: {mode}") diff --git a/src/tests/ftest/util/daos_core_base.py b/src/tests/ftest/util/daos_core_base.py index 1baa93b91b4f..9bf0ff4c501b 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2018-2023 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -96,14 +96,6 @@ def start_server_managers(self, force=False): ["=".join(items) for items in list(env_dict.items())] ) - # Update any other server settings unique to this test method - for setting in ["crt_timeout"]: - value = self.get_test_param(setting) - if value: - for server_mgr in self.server_managers: - for engine_params in server_mgr.manager.job.yaml.engine_params: - engine_params.set_value(setting, value) - # Start the servers return super().start_server_managers(force=force) diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 7bb310f6303e..9ae5581b45cc 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -31,7 +31,6 @@ def _get_build_environment(logger, build_vars_file): Returns: str: The prefix of the DAOS install. None: If the file is not present. 
- """ logger.debug("Obtaining DAOS build environment from %s", build_vars_file) try: @@ -169,8 +168,12 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu all_hosts = NodeSet() all_hosts.update(servers) all_hosts.update(clients) - self.provider = provider - self.insecure_mode = insecure_mode + + # Override values if explicitly specified + if provider is not None: + self.provider = provider + if insecure_mode is not None: + self.insecure_mode = insecure_mode if self.log_dir is None: self.log_dir = self.default_log_dir() @@ -551,7 +554,10 @@ def set_test_environment(logger, test_env=None, servers=None, clients=None, prov # Update the PATH environment variable build_vars_file = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", ".build_vars.json") +<<<<<<< HEAD # os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", "..", ".build_vars.json") +======= +>>>>>>> amd/avocado-version _update_path(logger, build_vars_file) # Get the default fabric interface and provider diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index 8646d4ef0ac0..21931c220319 100644 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -67,6 +67,9 @@ def execute_mdtest(self, out_queue=None, display_space=True): Args: out_queue (queue, optional): Pass any exceptions in a queue. Defaults to None. display_space (bool, optional): Whether to display the pool space. Defaults to True. + + Returns: + object: result of job manager run """ # Create a pool if one does not already exist if self.pool is None: @@ -83,17 +86,20 @@ def execute_mdtest(self, out_queue=None, display_space=True): self.mdtest_cmd.test_dir.update(self.dfuse.mount_dir.value) # Run Mdtest - self.run_mdtest(self.get_mdtest_job_manager_command(self.manager), - self.processes, display_space=display_space, out_queue=out_queue) + out = self.run_mdtest( + self.get_mdtest_job_manager_command(self.manager), + self.processes, display_space=display_space, out_queue=out_queue) if self.subprocess: - return + return out # reset self.container if dfs_destroy is True or None. if self.mdtest_cmd.dfs_destroy is not False: self.container = None self.stop_dfuse() + return out + def get_mdtest_job_manager_command(self, mpi_type): """Get the MPI job manager command for Mdtest. diff --git a/src/tests/ftest/util/performance_test_base.py b/src/tests/ftest/util/performance_test_base.py index bf7a24907ef0..4fdb8ae06d4a 100644 --- a/src/tests/ftest/util/performance_test_base.py +++ b/src/tests/ftest/util/performance_test_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -9,7 +9,6 @@ import oclass_utils from avocado.core.exceptions import TestFail from exception_utils import CommandFailure -from general_utils import get_subprocess_stdout from ior_test_base import IorTestBase from ior_utils import IorMetrics from mdtest_test_base import MdtestBase @@ -217,9 +216,7 @@ def verify_system_status(self, pool=None, container=None): if pool: funcs.append(pool.set_query_data) if container: - funcs.append( - lambda: self.log.info( - self.daos_cmd.container_query(container.pool.identifier, container.uuid))) + funcs.append(container.query) first_error = None for func in funcs: @@ -254,45 +251,21 @@ def verify_oclass_engine_count(self, oclass, fail=True): return False return True - def restart_servers(self): - """Restart the servers.""" - self.log.info("Restarting servers") - self.dmg_cmd.system_stop(True) - if self.dmg_cmd.result.exit_status != 0: - self.fail("Failed to stop servers") - time.sleep(5) - self.dmg_cmd.system_start() - if self.dmg_cmd.result.exit_status != 0: - self.fail("Failed to start servers") - self.server_managers[0].detect_engine_start() - - def _run_performance_ior_single(self, stop_rank_s=None, intercept=None): + def _run_performance_ior_single(self, intercept=None): """Run a single IOR execution. Args: - stop_rank_s (float, optional): stop a rank this many seconds after starting IOR. - Default is None, which does not stop a rank. intercept (str, optional): path to interception library. """ - # Always run as a subprocess so we can stop ranks during IO - self.subprocess = True - - self.run_ior_with_pool( - create_pool=False, - create_cont=False, - intercept=intercept, - display_space=False, - stop_dfuse=False - ) - if stop_rank_s is not None: - time.sleep(stop_rank_s) - self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0]) - ior_returncode = self.job_manager.process.wait() try: - if ior_returncode != 0: - self.fail("IOR failed") - ior_output = get_subprocess_stdout(self.job_manager.process) + ior_output = self.run_ior_with_pool( + create_pool=False, + create_cont=False, + intercept=intercept, + display_space=False, + stop_dfuse=False + ) ior_metrics = self.ior_cmd.get_ior_metrics(ior_output) for metrics in ior_metrics: if metrics[0] == "write": @@ -309,9 +282,7 @@ def _run_performance_ior_single(self, stop_rank_s=None, intercept=None): # Try this even if IOR failed because it could give us useful info self.verify_system_status(self.pool, self.container) - def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_write=None, - stop_delay_read=None, num_iterations=1, - restart_between_iterations=True): + def run_performance_ior(self, namespace=None, use_intercept=True): """Run an IOR performance test. Write and Read are ran separately. @@ -321,26 +292,8 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri Defaults to None, which uses default IOR namespace. use_intercept (bool, optional): whether to use the interception library with dfuse. Defaults to True. - stop_delay_write (float, optional): fraction of stonewall time after which to stop a - rank during write phase. Must be between 0 and 1. Default is None. - stop_delay_read (float, optional): fraction of stonewall time after which to stop a - rank during read phase. Must be between 0 and 1. Default is None. - num_iterations (int, optional): number of times to run the tests. - Default is 1. 
- restart_between_iterations (int, optional): whether to restart the servers between - iterations. Default is True. """ - if stop_delay_write is not None and (stop_delay_write < 0 or stop_delay_write > 1): - self.fail("stop_delay_write must be between 0 and 1") - if stop_delay_read is not None and (stop_delay_read < 0 or stop_delay_read > 1): - self.fail("stop_delay_read must be between 0 and 1") - if stop_delay_write is not None and stop_delay_read is not None: - # This isn't straightforward, because stopping a rank during write degrades - # performance, so read tries to read the same number of bytes as write, - # but might finish before the rank is stopped. - self.fail("stop_delay_write and stop_delay_read cannot be used together") - if namespace is not None: self.ior_cmd.namespace = namespace self.ior_cmd.get_params(self) @@ -351,13 +304,6 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri else: intercept = None - # Calculate both stop delays upfront since read phase will remove stonewall - stop_rank_write_s = stop_rank_read_s = None - if stop_delay_write and self.ior_cmd.sw_deadline.value: - stop_rank_write_s = stop_delay_write * self.ior_cmd.sw_deadline.value - if stop_delay_read and self.ior_cmd.sw_deadline.value: - stop_rank_read_s = stop_delay_read * self.ior_cmd.sw_deadline.value - # Save write and read params for switching write_flags = self.params.get("write_flags", self.ior_cmd.namespace) read_flags = self.params.get("read_flags", self.ior_cmd.namespace) @@ -376,7 +322,7 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri # Set the container redundancy factor to match the oclass cont_rf = oclass_utils.extract_redundancy_factor(self.ior_cmd.dfs_oclass.value) - # Create pool and container upfront for flexibility and so rank stop timing is accurate + # Create pool and container upfront for flexibility self.pool = self.get_pool(connect=False) params = {} if self.ior_cmd.dfs_oclass.value: @@ -391,50 +337,33 @@ def run_performance_ior(self, namespace=None, use_intercept=True, stop_delay_wri self.container.create() self.update_ior_cmd_with_pool(False) - for iteration in range(num_iterations): - if restart_between_iterations and iteration > 0: - self.restart_servers() - - self.log.info("Running IOR write (%s)", str(iteration)) - self.ior_cmd.flags.update(write_flags) - self._run_performance_ior_single(stop_rank_write_s, intercept) - - # Manually stop dfuse after ior write completes - self.stop_dfuse() - - # Wait for rebuild if we stopped a rank - if stop_rank_write_s: - self.pool.wait_for_rebuild_to_end() + self.log_step("Running IOR write") + self.ior_cmd.flags.update(write_flags) + self._run_performance_ior_single(intercept) - # Wait between write and read - self.phase_barrier() + # Manually stop dfuse after ior write completes + self.stop_dfuse() - self.log.info("Running IOR read (%s)", str(iteration)) - self.ior_cmd.flags.update(read_flags) - self._run_performance_ior_single(stop_rank_read_s, intercept) + # Wait between write and read + self.phase_barrier() - # Manually stop dfuse after ior read completes - self.stop_dfuse() + self.log_step("Running IOR read") + self.ior_cmd.flags.update(read_flags) + self._run_performance_ior_single(intercept) - # Wait for rebuild if we stopped a rank - if stop_rank_read_s: - self.pool.wait_for_rebuild_to_end() + # Manually stop dfuse after ior read completes + self.stop_dfuse() self._log_daos_metrics() - def run_performance_mdtest(self, namespace=None, stop_delay=None): + def 
run_performance_mdtest(self, namespace=None): """Run an MDTest performance test. Args: namespace (str, optional): namespace for MDTest parameters in the yaml. Defaults to None, which uses default MDTest namespace. - stop_delay (float, optional): fraction of stonewall time after which to stop a - rank. Must be between 0 and 1. Defaults to None. """ - if stop_delay is not None and (stop_delay < 0 or stop_delay > 1): - self.fail("stop_delay must be between 0 and 1") - if namespace is not None: self.mdtest_cmd.namespace = namespace self.mdtest_cmd.get_params(self) @@ -445,8 +374,6 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): if self.mdtest_cmd.api.value not in ('DFS', 'POSIX'): self.fail("Only DFS API supported") - stop_rank_s = (stop_delay or 0) * (self.mdtest_cmd.stonewall_timer.value or 0) - self._log_performance_params("MDTEST") self.verify_oclass_engine_count(self.mdtest_cmd.dfs_oclass.value) @@ -484,20 +411,10 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): # Never let execute_mdtest automatically destroy the container self.mdtest_cmd.dfs_destroy.update(False) - # Always run as a subprocess so we can stop ranks during IO - self.subprocess = True - self.log.info("Running MDTEST") - self.execute_mdtest(display_space=False) - if stop_rank_s: - time.sleep(stop_rank_s) - self.server_managers[0].stop_random_rank(self.d_log, force=True, exclude_ranks=[0]) - mdtest_returncode = self.job_manager.process.wait() try: - if mdtest_returncode != 0: - self.fail("mdtest failed") - mdtest_output = get_subprocess_stdout(self.job_manager.process) - mdtest_metrics = MdtestMetrics(mdtest_output) + mdtest_result = self.execute_mdtest(display_space=False) + mdtest_metrics = MdtestMetrics(mdtest_result.stdout_text) if not mdtest_metrics: self.fail("Failed to get mdtest metrics") log_list = [] @@ -523,8 +440,4 @@ def run_performance_mdtest(self, namespace=None, stop_delay=None): # Manually stop dfuse after mdtest completes self.stop_dfuse() - # Wait for rebuild if we stopped a rank - if stop_rank_s: - self.pool.wait_for_rebuild_to_end() - self._log_daos_metrics() diff --git a/src/tests/ftest/util/rebuild_test_base.py b/src/tests/ftest/util/rebuild_test_base.py index bdb31ea9807a..c3df3c1efd02 100644 --- a/src/tests/ftest/util/rebuild_test_base.py +++ b/src/tests/ftest/util/rebuild_test_base.py @@ -191,7 +191,7 @@ def execute_rebuild_test(self, create_container=True): # Refresh local pool and container self.pool.check_pool_info() - self.container.check_container_info() + self.container.query() # Verify the excluded rank is no longer used with the objects self.verify_rank_has_no_objects() diff --git a/src/tests/ftest/util/server_utils_base.py b/src/tests/ftest/util/server_utils_base.py index bd8b43acefdf..8362f6ea5c28 100644 --- a/src/tests/ftest/util/server_utils_base.py +++ b/src/tests/ftest/util/server_utils_base.py @@ -27,7 +27,7 @@ class AutosizeCancel(Exception): class DaosServerCommand(YamlCommand): """Defines an object representing the daos_server command.""" - NORMAL_PATTERN = "DAOS I/O Engine.*started" + NORMAL_PATTERN = "DAOS I/O Engine.*process [0-9]+ started on" FORMAT_PATTERN = "(SCM format required)(?!;)" REFORMAT_PATTERN = "Metadata format required" SYSTEM_QUERY_PATTERN = "joined" diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index e5ee2c2a1fb8..cc0ae3ac18b0 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -419,8 +419,7 @@ def launch_vmd_identify_check(self, name, 
results, args): for uuid in uuids: # Blink led - self.dmg_command.storage_led_identify(ids=uuid, reset=True) - time.sleep(2) + self.dmg_command.storage_led_identify(ids=uuid, timeout=2) # check if led is blinking result = self.dmg_command.storage_led_check(ids=uuid) # determine if leds are blinking as expected @@ -430,6 +429,9 @@ def launch_vmd_identify_check(self, name, results, args): if device['ctrlr']['led_state'] != "QUICK_BLINK": failing_vmd.append([device['ctrlr']['pci_addr'], value['hosts']]) status = False + # reset leds to previous state + for uuid in uuids: + self.dmg_command.storage_led_identify(ids=uuid, reset=True) params = {"name": name, "status": status, diff --git a/src/tests/ftest/util/test_utils_container.py b/src/tests/ftest/util/test_utils_container.py index 3eedc1158d4f..7d68256d92bb 100644 --- a/src/tests/ftest/util/test_utils_container.py +++ b/src/tests/ftest/util/test_utils_container.py @@ -13,7 +13,7 @@ from command_utils_base import BasicParameter from exception_utils import CommandFailure from general_utils import DaosTestError, get_random_bytes -from pydaos.raw import DaosApiError, DaosContainer, DaosInputParams, c_uuid_to_str, str_to_c_uuid +from pydaos.raw import DaosApiError, DaosContainer, DaosInputParams, str_to_c_uuid from test_utils_base import TestDaosApiBase @@ -278,7 +278,6 @@ def __init__(self, pool, daos_command=None, label_generator=None): self.container = None self.uuid = None - self.info = None self.opened = False self.written_data = [] self.epoch = None @@ -565,61 +564,10 @@ def destroy(self, force=1): self.container = None self.uuid = None - self.info = None self.written_data = [] return status - @fail_on(DaosApiError) - def get_info(self, coh=None): - """Query the container for information. - - Sets the self.info attribute. - - Args: - coh (str, optional): container handle override. Defaults to None. - - """ - if self.container: - self.open() - self.log.info("Querying container %s", str(self)) - self._call_method(self.container.query, {"coh": coh}) - self.info = self.container.info - - def check_container_info(self, ci_uuid=None, ci_nsnapshots=None, ci_nhandles=None): - # pylint: disable=unused-argument - """Check the container info attributes. - - Note: - Arguments may also be provided as a string with a number preceded - by '<', '<=', '>', or '>=' for other comparisons besides the - default '=='. - - Args: - ci_uuid (str, optional): container uuid. Defaults to None. - ci_nsnapshots (int, optional): number of snapshots. - Defaults to None. - - Note: - Arguments may also be provided as a string with a number preceded - by '<', '<=', '>', or '>=' for other comparisons besides the - default '=='. - - Returns: - bool: True if at least one expected value is specified and all the - specified values match; False otherwise - - """ - self.get_info() - checks = [ - (key, - c_uuid_to_str(getattr(self.info, key)) - if key == "ci_uuid" else getattr(self.info, key), - val) - for key, val in list(locals().items()) - if key != "self" and val is not None] - return self._check_info(checks) - def write_objects(self, rank=None, obj_class=None): """Write objects to the container. @@ -1036,6 +984,24 @@ def query(self, *args, **kwargs): return self.daos.container_query( pool=self.pool.identifier, cont=self.identifier, *args, **kwargs) + def verify_query(self, expected_response): + """Verify daos container query returns expected response values. 
+
+        Args:
+            expected_response (dict): expected response values
+
+        Returns:
+            bool: whether response values from daos container query match expected values
+
+        """
+        response = self.query()['response']
+        for expected_key, expected_val in expected_response.items():
+            if expected_key not in response:
+                return False
+            if response[expected_key] != expected_val:
+                return False
+        return True
+
     def set_attr(self, *args, **kwargs):
         """Call daos container set-attr.
 
diff --git a/src/tests/ftest/util/test_utils_pool.py b/src/tests/ftest/util/test_utils_pool.py
index 421c988a0ea6..48e79d619941 100644
--- a/src/tests/ftest/util/test_utils_pool.py
+++ b/src/tests/ftest/util/test_utils_pool.py
@@ -1,5 +1,5 @@
 """
-  (C) Copyright 2018-2023 Intel Corporation.
+  (C) Copyright 2018-2024 Intel Corporation.
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -112,7 +112,7 @@ def get_size_params(pool):
             "nvme_size": pool.nvme_per_rank}
 
 
-def check_pool_creation(test, pools, max_duration, offset=1, durations=None):
+def check_pool_creation(test, pools, max_duration, offset=1, durations=None, minimum=None):
     """Check the duration of each pool creation meets the requirement.
 
     Args:
@@ -122,22 +122,45 @@ def check_pool_creation(test, pools, max_duration, offset=1, durations=None):
        offset (int, optional): pool index offset. Defaults to 1.
         durations (list, optional): list of other pool create durations to include in the check.
             Defaults to None.
+        minimum (int, optional): if specified, the minimum number of pools that must be created.
+
+    Returns:
+        list: list of created pools.
 
     """
+    der_nospace_str = "DER_NOSPACE(-1007)"
+
     if durations is None:
         durations = []
 
-    for index, pool in enumerate(pools):
-        durations.append(time_pool_create(test.log, index + offset, pool))
-
     exceeding_duration = 0
-    for index, duration in enumerate(durations):
-        if duration > max_duration:
-            exceeding_duration += 1
-
-    if exceeding_duration:
+    for index, pool in enumerate(pools):
+        try:
+            duration = time_pool_create(test.log, index + offset, pool)
+            if duration > max_duration:
+                test.log.debug(
+                    "Creating pool %s took longer than expected: max=%i, got=%f",
+                    pool, max_duration, duration)
+                exceeding_duration += 1
+        except TestFail as error:
+            if minimum is None:
+                raise error
+            if der_nospace_str not in str(error):
+                test.fail(f'Unexpected error occurred: wait="{der_nospace_str}", got="{error}"')
+            if index < minimum:
+                test.fail(f'Minimum pool quantity ({index}/{minimum}) not reached: {error}')
+
+            test.log.info(
+                "Quantity of pools created lower than expected: wait=%i, min=%i, got=%i",
+                len(pools), minimum, index)
+            pools = pools[:index]
+            break
+
+    if exceeding_duration > 0:
         test.fail(
             "Pool creation took longer than {} seconds on {} pool(s)".format(
                 max_duration, exceeding_duration))
 
+    return pools
+
 
 def time_pool_create(log, number, pool):
     """Time how long it takes to create a pool.
 
@@ -354,7 +377,7 @@ def create(self):
             self.pool = TestPool(self.context, DmgCommand(self.bin))
 
         If it wants to use --nsvc option, it needs to set the value to
-        svcn.value. Otherwise, 1 is used. If it wants to use --group, it needs
+        svcn.value. If it wants to use --group, it needs
         to set groupname.value. If it wants to use --user, it needs to set
         username.value. If it wants to add other options, directly set it to
         self.dmg.action_command. Refer dmg_utils.py pool_create method for
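An illustrative usage sketch of the new `minimum` contract in check_pool_creation():
creation stops at the first DER_NOSPACE failure, the call fails the test only if fewer
than `minimum` pools were created, and the possibly trimmed list is returned:

    pools = [add_pool(self, create=False) for _ in range(200)]
    pools = check_pool_creation(self, pools, max_duration=30, minimum=150)
    # len(pools) is now between 150 and 200; later checks iterate this trimmed list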
diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c
index 7800ed7d2c98..d4d30ef7b051 100644
--- a/src/tests/suite/daos_base_tx.c
+++ b/src/tests/suite/daos_base_tx.c
@@ -704,6 +704,9 @@ dtx_resend_delay(test_arg_t *arg, daos_oclass_id_t oclass)
 	daos_fail_loc_set(0);
 	dtx_set_fail_loc(arg, 0);
 
+	/* Wait for the former delayed RPC before destroying the container to avoid DER_BUSY. */
+	sleep(2);
+
 	D_FREE(update_buf);
 	D_FREE(fetch_buf);
 	ioreq_fini(&req);
@@ -941,9 +944,9 @@ static const struct CMUnitTest dtx_tests[] = {
 	{"DTX19: DTX resend during bulk data transfer - multiple reps",
 	 dtx_19, NULL, test_case_teardown},
 	{"DTX20: race between DTX refresh and DTX resync",
-	 dtx_20, dtx_base_rf1_setup, test_case_teardown},
+	 dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown},
 	{"DTX21: do not abort partially committed DTX",
-	 dtx_21, dtx_base_rf0_setup, test_case_teardown},
+	 dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown},
 };
 
 static int
diff --git a/src/tests/suite/daos_checksum.c b/src/tests/suite/daos_checksum.c
index f574635c3060..12757f5ce4de 100644
--- a/src/tests/suite/daos_checksum.c
+++ b/src/tests/suite/daos_checksum.c
@@ -2885,7 +2885,7 @@ run_daos_checksum_test(int rank, int size, int *sub_tests, int sub_tests_size)
 	}
 
 	if (sub_tests_size == 0) {
-		if (getenv("DAOS_CSUM_TEST_ALL_TYPE")) {
+		if (d_isenv_def("DAOS_CSUM_TEST_ALL_TYPE")) {
 			for (i = DAOS_PROP_CO_CSUM_OFF + 1;
 			     i <= DAOS_PROP_CO_CSUM_ADLER32; i++) {
 				dts_csum_prop_type = i;
diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c
index c0f96859f7ff..d8610ff373f5 100644
--- a/src/tests/suite/daos_container.c
+++ b/src/tests/suite/daos_container.c
@@ -619,6 +619,8 @@ co_op_retry(void **state)
 	const char	*label1 = "co_op_retry_cont_fi_pass";
 	const char	*label2 = "co_op_retry_cont_fi_fail";
 	daos_handle_t	 coh;
+	daos_pool_info_t pinfo;
+	d_rank_t	 leader_rank;
 	daos_cont_info_t info;
 	char const *const names[] = {"TestAttrName0", "TestAttrName1"};
 	void const *const in_values[] = {"TestAttrValue0", "TestAttrValue1"};
@@ -634,6 +636,15 @@ co_op_retry(void **state)
 	if (arg->myrank != 0)
 		return;
 
+	print_message("querying pool info... ");
+	memset(&pinfo, 'D', sizeof(pinfo));
+	pinfo.pi_bits = DPI_ALL;
+	rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */);
+	assert_rc_equal(rc, 0);
+	leader_rank = pinfo.pi_leader;
+	print_message("success\n");
+	print_message("first leader rank=%d\n", leader_rank);
+
 	print_message("creating container ...");
 	rc = daos_cont_create(arg->pool.poh, &uuid, NULL, NULL);
 	assert_rc_equal(rc, 0);
@@ -645,19 +656,19 @@ co_op_retry(void **state)
 	assert_rc_equal(rc, 0);
 	print_message("success\n");
 
-	test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_CONT_QUERY_FAIL_CORPC | DAOS_FAIL_ONCE);
+	test_set_engine_fail_loc(arg, leader_rank, DAOS_CONT_QUERY_FAIL_CORPC | DAOS_FAIL_ONCE);
 	print_message("querying container (corpc failure, retry RPC) ... ");
 	rc = daos_cont_query(coh, &info, NULL, NULL);
 	assert_rc_equal(rc, 0);
 	print_message("success\n");
 
-	test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_CONT_CLOSE_FAIL_CORPC | DAOS_FAIL_ONCE);
+	test_set_engine_fail_loc(arg, leader_rank, DAOS_CONT_CLOSE_FAIL_CORPC | DAOS_FAIL_ONCE);
 	print_message("closing container (corpc failure, retry RPC) ... 
"); rc = daos_cont_close(coh, NULL); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_CONT_DESTROY_FAIL_CORPC | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_CONT_DESTROY_FAIL_CORPC | DAOS_FAIL_ONCE); print_message("destroying container (corpc failure, retry RPC) ... "); rc = daos_cont_destroy(arg->pool.poh, str, 1 /* force */, NULL); assert_rc_equal(rc, 0); @@ -665,48 +676,42 @@ co_op_retry(void **state) /* fault inject a timeout reply after successful handling; rpc retry sees success. */ - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("creating container %s (retry / dup rpc detection) ... ", label1); rc = daos_cont_create_with_label(arg->pool.poh, label1, NULL, &uuid, NULL); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * cont_op_save() - */ - assert_rc_equal(rc, -DER_EXIST); + assert_rc_equal(rc, 0); print_message("success, created container: " DF_UUID "\n", DP_UUID(uuid)); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("opening container %s (retry / dup rpc detection) ... ", label1); uuid_unparse(uuid, str); rc = daos_cont_open(arg->pool.poh, label1, DAOS_COO_RW, &coh, &info, NULL); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("setting container attributes (retry / dup rpc detection)... "); rc = daos_cont_set_attr(coh, n, names, in_values, in_sizes, NULL /* ev */); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("deleting container attributes (retry / dup rpc detection)... "); rc = daos_cont_del_attr(coh, n, names, NULL /* ev */); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("creating snapshot on container (retry / dup rpc detection)... "); rc = daos_cont_create_snap(coh, &epoch, NULL, NULL /* ev */); assert_rc_equal(rc, 0); print_message("success, epoch= " DF_X64 "\n", epoch); epr.epr_lo = epr.epr_hi = epoch; - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("destroying snapshot on container (retry / dup rpc detection)... 
"); rc = daos_cont_destroy_snap(coh, epr, NULL /* ev */); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * cont_op_save() - */ - assert_rc_equal(rc, -DER_NONEXIST); + assert_rc_equal(rc, 0); print_message("success\n"); ace = daos_ace_create(DAOS_ACL_EVERYONE, NULL); @@ -716,19 +721,16 @@ co_op_retry(void **state) acl = daos_acl_create(&ace, 1); assert_non_null(acl); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("update container ACL (retry / dup rpc detection)... "); rc = daos_cont_update_acl(coh, acl, NULL); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("delete container ACL (retry / dup rpc detection)... "); rc = daos_cont_delete_acl(coh, DAOS_ACL_EVERYONE, NULL, NULL); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * cont_op_save() - */ - assert_rc_equal(rc, -DER_NONEXIST); + assert_rc_equal(rc, 0); print_message("success\n"); prop = daos_prop_alloc(1); @@ -736,137 +738,201 @@ co_op_retry(void **state) prop->dpp_entries[0].dpe_type = DAOS_PROP_CO_SNAPSHOT_MAX; prop->dpp_entries[0].dpe_val = 1023; - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("set container property (retry / dup rpc detection)... "); rc = daos_cont_set_prop(coh, prop, NULL); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("closing container (retry / dup rpc detection) ... "); rc = daos_cont_close(coh, NULL); assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + /* cont open success committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("open container %s (new leader / retry / dup rpc detection)... ", label1); + rc = daos_cont_open(arg->pool.poh, label1, DAOS_COO_RW, &coh, &info, NULL); + assert_rc_equal(rc, 0); + print_message("success\n"); + + print_message("querying pool info for new leader ... "); + rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = pinfo.pi_leader; + print_message("success\n"); + print_message("new leader rank=%d\n", leader_rank); + + /* cont close success committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("closing container (new leader / retry / dup rpc detection)... "); + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + print_message("success\n"); + + print_message("querying pool info for new leader ... 
"); + rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = pinfo.pi_leader; + print_message("success\n"); + print_message("new leader rank=%d\n", leader_rank); + + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); print_message("destroying container %s (retry / dup rpc detection) ... ", label1); rc = daos_cont_destroy(arg->pool.poh, label1, 1 /* force */, NULL); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * cont_op_save() - */ - assert_rc_equal(rc, -DER_NONEXIST); + assert_rc_equal(rc, 0); print_message("success\n"); /* fault inject a timeout reply after failed handling; rpc retry sees failure. */ - /* FIXME: DAOS-14020, change to expect rc == -DER_MISC for all DAOS_MD_OP_FAIL_NOREPLY - * cases when dup detection is enabled in cont_op_save() - */ - - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); - print_message("test-fail creating container %s (retry / dup rpc detection) ... ", label1); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + print_message("test-fail creating container %s (retry / dup rpc detection) ... ", label2); rc = daos_cont_create_with_label(arg->pool.poh, label2, NULL, &uuid, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); print_message("creating container %s ... ", label2); rc = daos_cont_create_with_label(arg->pool.poh, label2, NULL, &uuid, NULL); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * cont_op_save() - */ - assert_rc_equal(rc, -DER_EXIST); + assert_rc_equal(rc, 0); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail opening container %s (retry / dup rpc detection) ... ", label2); uuid_unparse(uuid, str); rc = daos_cont_open(arg->pool.poh, label2, DAOS_COO_RW, &coh, &info, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); -#if 0 - /* FIXME: DAOS-14020, enable this code when dup detection enabled in cont_op_save() */ print_message("opening container %s ... ", label2); rc = daos_cont_open(arg->pool.poh, label2, DAOS_COO_RW, &coh, &info, NULL); assert_rc_equal(rc, 0); print_message("success\n"); -#endif - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail setting container attributes (retry / dup rpc detection)... "); rc = daos_cont_set_attr(coh, n, names, in_values, in_sizes, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail deleting container attributes (retry / dup rpc detection)... 
"); rc = daos_cont_del_attr(coh, n, names, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail creating snapshot on container (retry / dup rpc detection)... "); rc = daos_cont_create_snap(coh, &epoch, NULL, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); -#if 0 - /* FIXME: DAOS-14020, enable this code when dup detection enabled in cont_op_save() */ print_message("creating snapshot on container (retry / dup rpc detection)... "); rc = daos_cont_create_snap(coh, &epoch, NULL, NULL /* ev */); assert_rc_equal(rc, 0); print_message("success\n"); -#endif epr.epr_lo = epr.epr_hi = epoch; - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail destroying snapshot on container (retry / dup rpc detection)... "); rc = daos_cont_destroy_snap(coh, epr, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail update container ACL (retry / dup rpc detection)... "); rc = daos_cont_update_acl(coh, acl, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail delete container ACL (retry / dup rpc detection)... "); rc = daos_cont_delete_acl(coh, DAOS_ACL_EVERYONE, NULL, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail set container property (retry / dup rpc detection)... "); rc = daos_cont_set_prop(coh, prop, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail closing container (retry / dup rpc detection) ... "); rc = daos_cont_close(coh, NULL); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); print_message("closing container ... "); rc = daos_cont_close(coh, NULL); - assert_rc_equal(rc, -DER_NO_HDL); + assert_rc_equal(rc, 0); + print_message("success\n"); + + /* cont open fail committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("test-fail open container %s (new leader / retry / dup rpc detection)... ", + label2); + rc = daos_cont_open(arg->pool.poh, label2, DAOS_COO_RW, &coh, &info, NULL); + assert_rc_equal(rc, -DER_MISC); + print_message("success\n"); + + print_message("querying pool info for new leader ... 
"); + rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = pinfo.pi_leader; + print_message("success\n"); + print_message("new leader rank=%d\n", leader_rank); + + print_message("open container %s ...", label2); + rc = daos_cont_open(arg->pool.poh, label2, DAOS_COO_RW, &coh, &info, NULL); + assert_rc_equal(rc, 0); + print_message("success\n"); + + /* cont close fail committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("test-fail close container %s (new leader / retry / dup rpc detection)... ", + label2); + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); - test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); + print_message("querying pool info for new leader ... "); + rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = pinfo.pi_leader; + print_message("success\n"); + print_message("new leader rank=%d\n", leader_rank); + + print_message("close container ..."); + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + print_message("success\n"); + + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail destroying container %s (retry / dup rpc detection) ... ", label2); rc = daos_cont_destroy(arg->pool.poh, label2, 1 /* force */, NULL); + assert_rc_equal(rc, -DER_MISC); + print_message("success\n"); + + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("test-fail destroying container %s (new leader / retry / dup rpc detection) " + "... ", + label2); + rc = daos_cont_destroy(arg->pool.poh, label2, 1 /* force */, NULL); + assert_rc_equal(rc, -DER_MISC); + print_message("success\n"); + + print_message("querying pool info for new leader ... "); + rc = daos_pool_query(arg->pool.poh, NULL, &pinfo, NULL, NULL /* ev */); assert_rc_equal(rc, 0); + leader_rank = pinfo.pi_leader; print_message("success\n"); + print_message("final leader rank=%d\n", leader_rank); -#if 0 - /* FIXME: DAOS-14020, enable this code when dup detection enabled in cont_op_save() */ print_message("destroying container %s ... ", label2); rc = daos_cont_destroy(arg->pool.poh, label2, 1 /* force */, NULL); assert_rc_equal(rc, 0); print_message("success\n"); -#endif daos_acl_free(acl); daos_ace_free(ace); diff --git a/src/tests/suite/daos_nvme_recovery.c b/src/tests/suite/daos_nvme_recovery.c index a9c2f2a0a82c..f84556b56b23 100644 --- a/src/tests/suite/daos_nvme_recovery.c +++ b/src/tests/suite/daos_nvme_recovery.c @@ -87,9 +87,10 @@ nvme_fault_reaction(void **state, int mode) daos_size_t nvme_size; /* Use the SCM size if set with environment */ - env = getenv("POOL_SCM_SIZE"); + d_agetenv_str(&env, "POOL_SCM_SIZE"); if (env) { size_gb = atoi(env); + d_freeenv_str(&env); if (size_gb != 0) scm_size = (daos_size_t)size_gb << 30; } @@ -723,23 +724,6 @@ nvme_test_simulate_IO_error(void **state) print_message("Final read_errors = %s\n", check_errors); assert_true(atoi(check_errors) == atoi(read_errors) + 1); - /* - * Verify writeErr=true and readErr:true available in control log - */ - char control_err[][50] = { - "detected blob I/O error! writeErr:true", - "detected blob I/O error! 
readErr:true"}; - for (i = 0; i < 2 ; i++) { - rc = verify_state_in_log(devices[rank_pos].host, - control_log_file, control_err[i]); - if (rc != 0) { - print_message( - " %s not found in log %s\n", control_err[i], - control_log_file); - assert_rc_equal(rc, 0); - } - } - /* Tear down */ D_FREE(ow_buf); D_FREE(fbuf); diff --git a/src/tests/suite/daos_pool.c b/src/tests/suite/daos_pool.c index 10144c00c1ff..dc2f7c47cd9c 100644 --- a/src/tests/suite/daos_pool.c +++ b/src/tests/suite/daos_pool.c @@ -493,14 +493,13 @@ pool_properties(void **state) { test_arg_t *arg0 = *state; test_arg_t *arg = NULL; - char label[] = "test_pool_properties"; -#if 0 /* DAOS-5456 space_rb props not supported with dmg pool create */ - uint64_t space_rb = 36; -#endif + char label[] = "test_pool_properties"; + uint64_t space_rb = 36; daos_prop_t *prop = NULL; daos_prop_t *prop_query; struct daos_prop_entry *entry; daos_pool_info_t info = {0}; + const uint64_t svc_ops_age = 180; int rc; char *expected_owner; char *expected_group; @@ -512,7 +511,7 @@ pool_properties(void **state) SMALL_POOL_SIZE, 0, NULL); assert_rc_equal(rc, 0); - prop = daos_prop_alloc(2); + prop = daos_prop_alloc(5); /* label - set arg->pool_label to use daos_pool_connect() */ prop->dpp_entries[0].dpe_type = DAOS_PROP_PO_LABEL; D_STRNDUP_S(prop->dpp_entries[0].dpe_str, label); @@ -523,11 +522,14 @@ pool_properties(void **state) prop->dpp_entries[1].dpe_type = DAOS_PROP_PO_SCRUB_MODE; prop->dpp_entries[1].dpe_val = DAOS_SCRUB_MODE_TIMED; -#if 0 /* DAOS-5456 space_rb props not supported with dmg pool create */ - /* change daos_prop_alloc() above, specify 2 entries not 1 */ - prop->dpp_entries[1].dpe_type = DAOS_PROP_PO_SPACE_RB; - prop->dpp_entries[1].dpe_val = space_rb; -#endif + prop->dpp_entries[2].dpe_type = DAOS_PROP_PO_SVC_OPS_ENABLED; + prop->dpp_entries[2].dpe_val = 0; /* disabled */ + + prop->dpp_entries[3].dpe_type = DAOS_PROP_PO_SVC_OPS_ENTRY_AGE; + prop->dpp_entries[3].dpe_val = svc_ops_age; /* seconds */ + + prop->dpp_entries[4].dpe_type = DAOS_PROP_PO_SPACE_RB; + prop->dpp_entries[4].dpe_val = space_rb; while (!rc && arg->setup_state != SETUP_POOL_CONNECT) rc = test_setup_next_step((void **)&arg, NULL, prop, NULL); @@ -550,12 +552,26 @@ pool_properties(void **state) if (entry == NULL || strcmp(entry->dpe_str, label) != 0) { fail_msg("label verification failed.\n"); } -#if 0 /* DAOS-5456 space_rb props not supported with dmg pool create */ + + entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SCRUB_MODE); + if (entry == NULL || (entry->dpe_val != DAOS_SCRUB_MODE_TIMED)) { + fail_msg("scrub_mode verification failed.\n"); + } + + entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SVC_OPS_ENABLED); + if (entry == NULL || (entry->dpe_val != 0)) { + fail_msg("svc_ops_enabled verification failed.\n"); + } + + entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SVC_OPS_ENTRY_AGE); + if (entry == NULL || (entry->dpe_val != svc_ops_age)) { + fail_msg("svc_ops_entry_age verification failed.\n"); + } + entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SPACE_RB); if (entry == NULL || entry->dpe_val != space_rb) { fail_msg("space_rb verification failed.\n"); } -#endif /* not set properties should get default value */ entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SELF_HEAL); if (entry == NULL || @@ -594,9 +610,11 @@ pool_properties(void **state) fail_msg("Owner-group prop verification failed.\n"); } +#if 0 entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SCRUB_MODE); if (entry == NULL || entry->dpe_val != DAOS_SCRUB_MODE_OFF) 
fail_msg("scrubber sched verification failed.\n"); +#endif entry = daos_prop_entry_get(prop_query, DAOS_PROP_PO_SCRUB_FREQ); if (entry == NULL) { @@ -637,6 +655,10 @@ pool_op_retry(void **state) if (arg->myrank != 0) return; + /* Not allowed to set the (test-only) property svc_ops_enabled on existing pool */ + rc = daos_pool_set_prop(arg->pool.pool_uuid, "svc_ops_enabled", "0"); + assert_rc_equal(rc, -DER_NO_PERM); + /* pool connect/query/disconnect failing corpcs - non-duplicate-RPC retries */ test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_POOL_CONNECT_FAIL_CORPC | DAOS_FAIL_ONCE); print_message("connecting to pool ... "); @@ -647,6 +669,7 @@ pool_op_retry(void **state) assert_int_equal(info.pi_ndisabled, 0); leader_rank = info.pi_leader; print_message("success\n"); + print_message("first leader rank=%d\n", leader_rank); test_set_engine_fail_loc(arg, leader_rank, DAOS_POOL_QUERY_FAIL_CORPC | DAOS_FAIL_ONCE); print_message("querying pool info... "); @@ -704,10 +727,7 @@ pool_op_retry(void **state) print_message("delete pool ACL with principal=%s (retry / dup rpc detection)... ", principal); rc = dmg_pool_delete_ace(arg->dmg_config, arg->pool.pool_uuid, arg->group, principal); - /* FIXME: DAOS-14020, change to expect rc == 0 when dup detection enabled in - * pool_op_save() - */ - assert_rc_equal(rc, -DER_NONEXIST); + assert_rc_equal(rc, 0); print_message("success\n"); /* pool set prop success committed, "lost" reply - duplicate RPC retry */ @@ -738,16 +758,21 @@ pool_op_retry(void **state) assert_rc_equal(rc, 0); print_message("success\n"); - /* FIXME: DAOS-14020, change to expect rc == -DER_MISC for all DAOS_MD_OP_FAIL_NOREPLY - * cases when dup detection is enabled in pool_op_save() - */ + /* TODO: implement dup op detection in ds_pool_update_handler()? */ +#if 0 + test_set_engine_fail_loc(arg, info.pi_leader, DAOS_MD_OP_PASS_NOREPLY | DAOS_FAIL_ONCE); + print_message("draining rank %d target idx 0 ... ", info.pi_leader); + rc = dmg_pool_drain(arg->dmg_config, arg->pool.pool_uuid, arg->group, info.pi_leader, 0); + assert_rc_equal(rc, 0); + print_message("success\n"); +#endif /* pool connect failure committed, "lost" reply - duplicate RPC retry */ test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail to connect to pool (retry / dup rpc detection)... "); rc = daos_pool_connect(arg->pool.pool_str, arg->group, DAOS_PC_RW, &poh, &info, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* get a real handle for the subsequent fault injection steps below */ @@ -761,7 +786,7 @@ pool_op_retry(void **state) test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail to set pool attributes (retry / dup rpc detection)... "); rc = daos_pool_set_attr(poh, n, names, in_values, in_sizes, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool delete attributes failure committed, "lost" reply - duplicate RPC retry */ @@ -769,7 +794,7 @@ pool_op_retry(void **state) print_message("test-fail to delete pool attributes (retry / dup rpc detection)... 
"); rc = daos_pool_del_attr(poh, n, names, NULL /* ev */); fflush(stdout); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool update ACL entry failure committed, "lost" reply - duplicate RPC retry */ @@ -777,7 +802,7 @@ pool_op_retry(void **state) print_message("test-fail update pool ACL with entry=%s (retry / dup rpc detection)... ", ace); rc = dmg_pool_update_ace(arg->dmg_config, arg->pool.pool_uuid, arg->group, ace); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool delete ACL entry failure committed, "lost" reply - duplicate RPC retry */ @@ -785,35 +810,34 @@ pool_op_retry(void **state) print_message("test-fail delete pool ACL with principal=%s (retry / dup rpc detection)... ", principal); rc = dmg_pool_delete_ace(arg->dmg_config, arg->pool.pool_uuid, arg->group, principal); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool set prop failure committed, "lost" reply - duplicate RPC retry */ test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail set pool prop (retry / dup rpc detection)... "); rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal", "rebuild"); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool evict failure committed, "lost" reply - duplicate RPC retry */ test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail to evict pool handles (retry / dup rpc detection)... "); rc = dmg_pool_evict(arg->dmg_config, arg->pool.pool_uuid, arg->group); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* pool disconnect failure committed, "lost" reply - duplicate RPC retry */ test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY | DAOS_FAIL_ONCE); print_message("test-fail to disconnect from pool (retry / dup rpc detection)... "); rc = daos_pool_disconnect(poh, NULL /* ev */); - assert_rc_equal(rc, 0); + assert_rc_equal(rc, -DER_MISC); print_message("success\n"); /* disconnect the real handle */ print_message("disconnecting from pool... "); rc = daos_pool_disconnect(poh, NULL /* ev */); - /* FIXME: DAOS-14020: expect rc == 0 when dup op detection is enabled */ - assert_rc_equal(rc, -DER_NO_HDL); + assert_rc_equal(rc, 0); print_message("success\n"); /* TODO: implement dup op detection in ds_pool_update_handler()? */ @@ -825,6 +849,43 @@ pool_op_retry(void **state) print_message("success\n"); #endif + /* pool connect success committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_PASS_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("connect to pool (new leader / retry / dup rpc detection)... "); + rc = daos_pool_connect(arg->pool.pool_str, arg->group, DAOS_PC_RW, &poh, &info, + NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = info.pi_leader; + print_message("success\n"); + print_message("new leader rank=%d\n", leader_rank); + + print_message("disconnecting from pool... "); + rc = daos_pool_disconnect(poh, NULL /* ev */); + assert_rc_equal(rc, 0); + print_message("success\n"); + + /* pool connect fail committed, "lost" reply, leader change - duplicate RPC retry */ + test_set_engine_fail_loc(arg, leader_rank, DAOS_MD_OP_FAIL_NOREPLY_NEWLDR | DAOS_FAIL_ONCE); + print_message("test-fail to connect to pool (new leader / retry / dup rpc detection)... 
"); + rc = daos_pool_connect(arg->pool.pool_str, arg->group, DAOS_PC_RW, &poh, &info, + NULL /* ev */); + assert_rc_equal(rc, -DER_MISC); + print_message("success\n"); + + /* get a real handle from the new leader */ + print_message("connecting to pool... "); + rc = daos_pool_connect(arg->pool.pool_str, arg->group, DAOS_PC_RW, &poh, &info, + NULL /* ev */); + assert_rc_equal(rc, 0); + leader_rank = info.pi_leader; + print_message("success\n"); + print_message("final leader rank=%d\n", leader_rank); + + print_message("disconnecting from pool... "); + rc = daos_pool_disconnect(poh, NULL /* ev */); + assert_rc_equal(rc, 0); + print_message("success\n"); + test_set_engine_fail_loc(arg, CRT_NO_RANK, 0); } diff --git a/src/tests/suite/daos_test_common.c b/src/tests/suite/daos_test_common.c index 7de287713ce3..ac7e5c2015a4 100644 --- a/src/tests/suite/daos_test_common.c +++ b/src/tests/suite/daos_test_common.c @@ -70,9 +70,10 @@ test_setup_pool_create(void **state, struct test_pool *ipool, daos_size_t nvme_size; d_rank_list_t *rank_list = NULL; - env = getenv("POOL_SCM_SIZE"); + d_agetenv_str(&env, "POOL_SCM_SIZE"); if (env) { size_gb = atoi(env); + d_freeenv_str(&env); if (size_gb != 0) outpool->pool_size = (daos_size_t)size_gb << 30; @@ -85,9 +86,10 @@ test_setup_pool_create(void **state, struct test_pool *ipool, * Set env POOL_NVME_SIZE to overwrite the default NVMe size. */ nvme_size = outpool->pool_size * 4; - env = getenv("POOL_NVME_SIZE"); + d_agetenv_str(&env, "POOL_NVME_SIZE"); if (env) { size_gb = atoi(env); + d_freeenv_str(&env); nvme_size = (daos_size_t)size_gb << 30; } diff --git a/src/tests/suite/daos_verify_consistency.c b/src/tests/suite/daos_verify_consistency.c index f5f03ddc1af5..82b4979bebed 100644 --- a/src/tests/suite/daos_verify_consistency.c +++ b/src/tests/suite/daos_verify_consistency.c @@ -352,8 +352,7 @@ vc_9(void **state) oid = daos_test_oid_gen(arg->coh, dts_vc_class, 0, 0, arg->myrank); ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); - vc_gen_modifications(arg, &req, oid, 7, 7, 7, - DAOS_VC_DIFF_DKEY, 0, 0); + vc_gen_modifications(arg, &req, oid, 7, 7, 7, DAOS_VC_DIFF_DKEY | DAOS_FAIL_ALWAYS, 0, 0); rc = vc_obj_verify(arg, oid); assert_rc_equal(rc, -DER_MISMATCH); diff --git a/src/tests/suite/dfs_test.c b/src/tests/suite/dfs_test.c index 217f30ad1780..29e37a0b759e 100644 --- a/src/tests/suite/dfs_test.c +++ b/src/tests/suite/dfs_test.c @@ -168,14 +168,16 @@ main(int argc, char **argv) } /** if writing XML, force all ranks other than rank 0 to use stdout to avoid conflicts */ - cmocka_message_output = getenv("CMOCKA_MESSAGE_OUTPUT"); + d_agetenv_str(&cmocka_message_output, "CMOCKA_MESSAGE_OUTPUT"); if (rank != 0 && cmocka_message_output && strcasecmp(cmocka_message_output, "xml") == 0) { + d_freeenv_str(&cmocka_message_output); rc = d_setenv("CMOCKA_MESSAGE_OUTPUT", "stdout", 1); if (rc) { print_message("d_setenv() failed with %d\n", rc); return -1; } } + d_freeenv_str(&cmocka_message_output); nr_failed = run_specified_tests(tests, rank, size, NULL, 0); diff --git a/src/tests/suite/dfs_unit_test.c b/src/tests/suite/dfs_unit_test.c index 6c2bf8fe1bbf..cc3091c3fe3f 100644 --- a/src/tests/suite/dfs_unit_test.c +++ b/src/tests/suite/dfs_unit_test.c @@ -1415,15 +1415,45 @@ dfs_test_chown(void **state) char *filename_file2 = "open_stat2"; mode_t create_mode = S_IWUSR | S_IRUSR; int create_flags = O_RDWR | O_CREAT | O_EXCL; + struct timespec ctime_orig, mtime_orig; + mode_t orig_mode; int rc; if (arg->myrank != 0) return; - rc = dfs_lookup(dfs_mt, "/", O_RDWR, &dir, 
NULL, &stbuf); + rc = dfs_lookup(dfs_mt, "/", O_RDWR, &dir, &orig_mode, &stbuf); assert_int_equal(rc, 0); assert_int_equal(stbuf.st_uid, geteuid()); assert_int_equal(stbuf.st_gid, getegid()); + mtime_orig.tv_sec = stbuf.st_mtim.tv_sec; + mtime_orig.tv_nsec = stbuf.st_mtim.tv_nsec; + ctime_orig.tv_sec = stbuf.st_ctim.tv_sec; + ctime_orig.tv_nsec = stbuf.st_ctim.tv_nsec; + + /** chown of root and see if visible */ + print_message("Running chown tests on root object...\n"); + memset(&stbuf, 0, sizeof(stbuf)); + stbuf.st_uid = 3; + stbuf.st_gid = 4; + stbuf.st_mtim.tv_sec = mtime_orig.tv_sec + 10; + stbuf.st_mtim.tv_nsec = mtime_orig.tv_nsec; + stbuf.st_mode = orig_mode | S_IROTH | S_IWOTH | S_IXOTH; + rc = dfs_osetattr(dfs_mt, dir, &stbuf, + DFS_SET_ATTR_UID | DFS_SET_ATTR_GID | DFS_SET_ATTR_MTIME | + DFS_SET_ATTR_MODE); + assert_int_equal(rc, 0); + rc = dfs_release(dir); + assert_int_equal(rc, 0); + + memset(&stbuf, 0, sizeof(stbuf)); + rc = dfs_lookup(dfs_mt, "/", O_RDWR, &dir, NULL, &stbuf); + assert_int_equal(rc, 0); + assert_int_equal(stbuf.st_mode, orig_mode | S_IROTH | S_IWOTH | S_IXOTH); + assert_int_equal(stbuf.st_uid, 3); + assert_int_equal(stbuf.st_gid, 4); + assert_true(check_ts(ctime_orig, stbuf.st_ctim)); + assert_int_equal(mtime_orig.tv_sec + 10, stbuf.st_mtim.tv_sec); rc = dfs_release(dir); assert_int_equal(rc, 0); @@ -1495,6 +1525,11 @@ run_time_tests(dfs_obj_t *obj, char *name, int mode) struct timespec prev_ts, first_ts; daos_size_t size; dfs_obj_t *tmp_obj; + struct tm tm = {0}; + time_t ts; + char *p; + struct tm *timeptr; + char time_str[64]; int rc; rc = dfs_stat(dfs_mt, NULL, name, &stbuf); @@ -1582,8 +1617,34 @@ run_time_tests(dfs_obj_t *obj, char *name, int mode) prev_ts.tv_sec = stbuf.st_mtim.tv_sec; prev_ts.tv_nsec = stbuf.st_mtim.tv_nsec; - /** set size on file with dfs_osetattr and stat at same time */ if (S_ISREG(mode)) { + /** set mtime and size at the same time; mtime should be what we set */ + memset(&stbuf, 0, sizeof(stbuf)); + stbuf.st_size = 1000; + p = strptime("2023-12-31", "%Y-%m-%d", &tm); + assert_non_null(p); + ts = mktime(&tm); + stbuf.st_mtim.tv_sec = ts; + stbuf.st_mtim.tv_nsec = 0; + rc = dfs_osetattr(dfs_mt, obj, &stbuf, DFS_SET_ATTR_SIZE | DFS_SET_ATTR_MTIME); + assert_int_equal(rc, 0); + assert_int_equal(stbuf.st_size, 1000); + /** check the mtime was updated with the setattr */ + assert_int_equal(ts, stbuf.st_mtim.tv_sec); + timeptr = localtime(&stbuf.st_mtim.tv_sec); + strftime(time_str, sizeof(time_str), "%Y-%m-%d", timeptr); + print_message("mtime = %s\n", time_str); + assert_true(strncmp("2023", time_str, 4) == 0); + + memset(&stbuf, 0, sizeof(stbuf)); + rc = dfs_ostat(dfs_mt, obj, &stbuf); + assert_int_equal(rc, 0); + assert_int_equal(stbuf.st_size, 1000); + timeptr = localtime(&stbuf.st_mtim.tv_sec); + strftime(time_str, sizeof(time_str), "%Y-%m-%d", timeptr); + assert_int_equal(ts, stbuf.st_mtim.tv_sec); + assert_true(strncmp("2023", time_str, 4) == 0); + memset(&stbuf, 0, sizeof(stbuf)); stbuf.st_size = 1024; rc = dfs_osetattr(dfs_mt, obj, &stbuf, DFS_SET_ATTR_SIZE); @@ -1593,12 +1654,6 @@ run_time_tests(dfs_obj_t *obj, char *name, int mode) assert_true(check_ts(prev_ts, stbuf.st_mtim)); } - struct tm tm = {0}; - time_t ts; - char *p; - struct tm *timeptr; - char time_str[64]; - /** set the mtime to 2020 */ p = strptime("2020-12-31", "%Y-%m-%d", &tm); assert_non_null(p); @@ -2503,7 +2558,7 @@ dfs_test_checker(void **state) test_arg_t *arg = *state; dfs_t *dfs; int nr = 100, i; - dfs_obj_t *root, *lf; + dfs_obj_t *root, *lf, *sym; 
daos_obj_id_t root_oid; daos_handle_t root_oh; daos_handle_t coh; @@ -2574,6 +2629,12 @@ dfs_test_checker(void **state) assert_int_equal(rc, 0); } + /** create a symlink with a non-existent target in the container */ + rc = dfs_open(dfs, NULL, "SL1", S_IFLNK | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT | O_EXCL, 0, + 0, "/usr/local", &sym); + assert_int_equal(rc, 0); + rc = dfs_release(sym); + rc = dfs_disconnect(dfs); assert_int_equal(rc, 0); /** have to call fini to release the cached container handle for the checker to work */ diff --git a/src/tests/suite/dfuse_test.c b/src/tests/suite/dfuse_test.c index bcff68d5ecfd..28cc137f45e9 100644 --- a/src/tests/suite/dfuse_test.c +++ b/src/tests/suite/dfuse_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2021-2023 Intel Corporation. + * (C) Copyright 2021-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -34,7 +34,7 @@ /* Tests can be run by specifying the appropriate argument for a test or all will be run if no test * is specified. */ -static const char *all_tests = "ismd"; +static const char *all_tests = "ismdl"; static void print_usage() @@ -47,6 +47,7 @@ print_usage() print_message("dfuse_test -s|--stream\n"); print_message("dfuse_test -m|--metadata\n"); print_message("dfuse_test -d|--directory\n"); + print_message("dfuse_test -l|--lowfd\n"); print_message("Default runs all tests\n=============\n"); print_message("\n=============================\n"); } @@ -146,6 +147,21 @@ do_openat(void **state) assert_return_code(rc, errno); assert_int_equal(stbuf.st_size, stbuf0.st_size); + /* cornercase: fd for a regular file is passed into fstatat(). Path is empty. */ + rc = fstatat(fd, "", &stbuf0, AT_EMPTY_PATH); + assert_return_code(rc, errno); + assert_int_equal(stbuf.st_size, stbuf0.st_size); + + /* expected to fail */ + rc = fstatat(fd, "", &stbuf0, 0); + assert_int_equal(rc, -1); + assert_int_equal(errno, ENOENT); + + /* expected to fail */ + rc = fstatat(fd, "entry", &stbuf0, 0); + assert_int_equal(rc, -1); + assert_int_equal(errno, ENOTDIR); + rc = close(fd); assert_return_code(rc, errno); @@ -494,6 +510,67 @@ do_directory(void **state) assert_return_code(rc, errno); } +#define MIN_DAOS_FD 10 +/* + * Check whether daos network context uses low fds 0~9. + */ +void +do_lowfd(void **state) +{ + int fd; + int rc; + int i; + bool pil4dfs_loaded = false; + char *env_ldpreload; + char fd_path[64]; + char *path; + + env_ldpreload = getenv("LD_PRELOAD"); + if (env_ldpreload == NULL) + return; + + if (strstr(env_ldpreload, "libpil4dfs.so")) + pil4dfs_loaded = true; + else + /* libioil cannot pass this test since low fds are only temporarily blocked */ + return; + + /* first time access a dir on DFS mount to trigger daos_init() */ + fd = open(test_dir, O_PATH | O_DIRECTORY); + assert_return_code(fd, errno); + + rc = close(fd); + assert_return_code(rc, errno); + + /* open the root dir and print fd */ + fd = open("/", O_PATH | O_DIRECTORY); + assert_return_code(fd, errno); + printf("fd = %d\n", fd); + rc = close(fd); + assert_return_code(rc, errno); + if (pil4dfs_loaded) + assert_true(fd >= MIN_DAOS_FD); + + /* now check whether daos uses low fds */ + path = malloc(PATH_MAX); + assert_non_null(path); + for (i = 0; i < MIN_DAOS_FD; i++) { + snprintf(fd_path, sizeof(fd_path) - 1, "/proc/self/fd/%d", i); + rc = readlink(fd_path, path, PATH_MAX - 1); + /* libioil only temporarily block low fds during daos_init(). + * libpil4dfs blocks low fds before daos_init() and does not free + * them until applications end. 
+ */ + if (!pil4dfs_loaded && rc == -1 && errno == ENOENT) + continue; + assert_true(rc > 0); + path[rc] = 0; + assert_true(strstr(path, "socket:") == NULL); + assert_true(strstr(path, "anon_inode:") == NULL); + } + free(path); +} + static int run_specified_tests(const char *tests, int *sub_tests, int sub_tests_size) { @@ -543,6 +620,15 @@ run_specified_tests(const char *tests, int *sub_tests, int sub_tests_size) }; nr_failed += cmocka_run_group_tests(readdir_tests, NULL, NULL); break; + case 'l': + printf("\n\n================="); + printf("dfuse low fd tests"); + printf("=====================\n"); + const struct CMUnitTest lowfd_tests[] = { + cmocka_unit_test(do_lowfd), + }; + nr_failed += cmocka_run_group_tests(lowfd_tests, NULL, NULL); + break; default: assert_true(0); @@ -568,9 +654,10 @@ main(int argc, char **argv) {"stream", no_argument, NULL, 's'}, {"metadata", no_argument, NULL, 'm'}, {"directory", no_argument, NULL, 'd'}, + {"lowfd", no_argument, NULL, 'l'}, {NULL, 0, NULL, 0}}; - while ((opt = getopt_long(argc, argv, "aM:imsd", long_options, &index)) != -1) { + while ((opt = getopt_long(argc, argv, "aM:imsdl", long_options, &index)) != -1) { if (strchr(all_tests, opt) != NULL) { tests[ntests] = opt; ntests++; diff --git a/src/utils/SConscript b/src/utils/SConscript index fa8bb06da7fa..847c1a5ac647 100644 --- a/src/utils/SConscript +++ b/src/utils/SConscript @@ -22,6 +22,7 @@ def scons(): # Build crt_launch SConscript('crt_launch/SConscript') + if prereqs.server_requested(): # Build daos_metrics SConscript('daos_metrics/SConscript') diff --git a/src/utils/self_test/self_test.c b/src/utils/self_test/self_test.c index 7c9d6e592d5e..a3157dc01f20 100644 --- a/src/utils/self_test/self_test.c +++ b/src/utils/self_test/self_test.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1827,18 +1827,16 @@ int main(int argc, char *argv[]) } if (use_daos_agent_vars == false) { - char *env; char *attach_path; + char *attach_path_env = NULL; - env = getenv("CRT_PHY_ADDR_STR"); - if (env == NULL) { + if (!d_isenv_def("CRT_PHY_ADDR_STR")) { printf("Error: provider (CRT_PHY_ADDR_STR) is not set\n"); printf("Example: export CRT_PHY_ADDR_STR='ofi+tcp'\n"); D_GOTO(cleanup, ret = -DER_INVAL); } - env = getenv("OFI_INTERFACE"); - if (env == NULL) { + if (!d_isenv_def("OFI_INTERFACE")) { printf("Error: interface (OFI_INTERFACE) is not set\n"); printf("Example: export OFI_INTERFACE=eth0\n"); D_GOTO(cleanup, ret = -DER_INVAL); @@ -1847,14 +1845,17 @@ int main(int argc, char *argv[]) if (attach_info_path) attach_path = attach_info_path; else { - attach_path = getenv("CRT_ATTACH_INFO_PATH"); + d_agetenv_str(&attach_path_env, "CRT_ATTACH_INFO_PATH"); + attach_path = attach_path_env; if (!attach_path) attach_path = "/tmp"; } + D_ASSERT(attach_path != NULL); printf("Warning: running without daos_agent connection (-u option); " "Using attachment file %s/%s.attach_info_tmp instead\n", attach_path, dest_name ? 
dest_name : default_dest_name); + d_freeenv_str(&attach_path_env); } /******************** Parse message sizes argument ********************/ diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index cf28bf0d573d..ea4cdd45c12f 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -987,7 +987,7 @@ vos_self_init(const char *db_path, bool use_sys_db, int tgt_id) goto failed; } - evt_mode = getenv("DAOS_EVTREE_MODE"); + rc = d_agetenv_str(&evt_mode, "DAOS_EVTREE_MODE"); if (evt_mode) { if (strcasecmp("soff", evt_mode) == 0) { vos_evt_feats &= ~EVT_FEATS_SUPPORTED; @@ -996,6 +996,7 @@ vos_self_init(const char *db_path, bool use_sys_db, int tgt_id) vos_evt_feats &= ~EVT_FEATS_SUPPORTED; vos_evt_feats |= EVT_FEAT_SORT_DIST_EVEN; } + d_freeenv_str(&evt_mode); } switch (vos_evt_feats & EVT_FEATS_SUPPORTED) { case EVT_FEAT_SORT_SOFF: diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index dc4e5e7c11ab..ca7f26ad5401 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -102,7 +102,7 @@ enum vos_gc_type { #define VOS_POOL_FEAT_2_4 (VOS_POOL_FEAT_CHK | VOS_POOL_FEAT_DYN_ROOT) /** 2.6 features */ -#define VOS_POOL_FEAT_2_6 (VOS_POOL_FEAT_EMB_VALUE | VOS_POOL_FEAT_FLAT_DKEY) +#define VOS_POOL_FEAT_2_6 (VOS_POOL_FEAT_FLAT_DKEY | VOS_POOL_FEAT_EMBED_FIRST) /** * Durable format for VOS pool diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index 20160a97abfd..e4a8c11fd7ae 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -103,12 +103,6 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, void vos_obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict); -static inline int -vos_obj_refcount(struct vos_object *obj) -{ - return obj->obj_llink.ll_ref; -} - /** Evict an object reference from the cache */ void vos_obj_evict(struct daos_lru_cache *occ, struct vos_object *obj); diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 11e55e9d1560..a45777990535 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -354,7 +354,7 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, if (intent == DAOS_INTENT_KILL && !(flags & VOS_OBJ_KILL_DKEY)) { if (obj != &obj_local) { - if (vos_obj_refcount(obj) > 2) + if (!daos_lru_is_last_user(&obj->obj_llink)) D_GOTO(failed, rc = -DER_BUSY); vos_obj_evict(occ, obj); diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index 940a9839aab2..b3935514caf7 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -47,6 +47,9 @@ iov2svt_key(d_iov_t *key_iov) static struct vos_rec_bundle * iov2rec_bundle(d_iov_t *val_iov) { + if (val_iov == NULL) + return NULL; + D_ASSERT(val_iov->iov_len == sizeof(struct vos_rec_bundle)); return (struct vos_rec_bundle *)val_iov->iov_buf; } @@ -307,9 +310,20 @@ static int ktr_rec_fetch(struct btr_instance *tins, struct btr_record *rec, d_iov_t *key_iov, d_iov_t *val_iov) { + char *kbuf; struct vos_krec_df *krec = vos_rec2krec(tins, rec); struct vos_rec_bundle *rbund = iov2rec_bundle(val_iov); + /** For embedded value, we sometimes only need to fetch the key, + * to generate the hash. 
+ */ + if (rbund == NULL) { + D_ASSERT(key_iov != NULL); + kbuf = vos_krec2key(krec); + d_iov_set(key_iov, kbuf, krec->kr_size); + return 0; + } + rbund->rb_krec = krec; if (key_iov != NULL) @@ -713,31 +727,33 @@ static btr_ops_t singv_btr_ops = { * @} vos_singv_btr */ static struct vos_btr_attr vos_btr_attrs[] = { - { - .ta_class = VOS_BTR_DKEY, - .ta_order = VOS_KTR_ORDER, - .ta_feats = BTR_FEAT_UINT_KEY | BTR_FEAT_DIRECT_KEY | BTR_FEAT_DYNAMIC_ROOT, - .ta_name = "vos_dkey", - .ta_ops = &key_btr_ops, - }, - { - .ta_class = VOS_BTR_AKEY, - .ta_order = VOS_KTR_ORDER, - .ta_feats = BTR_FEAT_UINT_KEY | BTR_FEAT_DIRECT_KEY | BTR_FEAT_DYNAMIC_ROOT, - .ta_name = "vos_akey", - .ta_ops = &key_btr_ops, - }, - { - .ta_class = VOS_BTR_SINGV, - .ta_order = VOS_SVT_ORDER, - .ta_feats = BTR_FEAT_DYNAMIC_ROOT, - .ta_name = "singv", - .ta_ops = &singv_btr_ops, - }, - { - .ta_class = VOS_BTR_END, - .ta_name = "null", - }, + { + .ta_class = VOS_BTR_DKEY, + .ta_order = VOS_KTR_ORDER, + .ta_feats = + BTR_FEAT_EMBED_FIRST | BTR_FEAT_UINT_KEY | BTR_FEAT_DIRECT_KEY | BTR_FEAT_DYNAMIC_ROOT, + .ta_name = "vos_dkey", + .ta_ops = &key_btr_ops, + }, + { + .ta_class = VOS_BTR_AKEY, + .ta_order = VOS_KTR_ORDER, + .ta_feats = + BTR_FEAT_EMBED_FIRST | BTR_FEAT_UINT_KEY | BTR_FEAT_DIRECT_KEY | BTR_FEAT_DYNAMIC_ROOT, + .ta_name = "vos_akey", + .ta_ops = &key_btr_ops, + }, + { + .ta_class = VOS_BTR_SINGV, + .ta_order = VOS_SVT_ORDER, + .ta_feats = BTR_FEAT_DYNAMIC_ROOT, + .ta_name = "singv", + .ta_ops = &singv_btr_ops, + }, + { + .ta_class = VOS_BTR_END, + .ta_name = "null", + }, }; static int @@ -892,6 +908,13 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, ta = obj_tree_find_attr(tclass, flags); + /** Single value tree uses major epoch for hash key and minor + * epoch for key so it doesn't play nicely with embedded value + * and even if it did, it would not be more efficient. 
+	 */
+	if (ta->ta_class != VOS_BTR_SINGV && (pool->vp_feats & VOS_POOL_FEAT_EMBED_FIRST))
+		tree_feats |= BTR_FEAT_EMBED_FIRST;
+
 	D_DEBUG(DB_TRACE, "Create dbtree %s feats 0x"DF_X64"\n",
 		ta->ta_name, tree_feats);
 
diff --git a/utils/ansible/ftest/templates/daos-launch.sh.j2 b/utils/ansible/ftest/templates/daos-launch.sh.j2
index 3969e1b88214..56c1f65df043 100644
--- a/utils/ansible/ftest/templates/daos-launch.sh.j2
+++ b/utils/ansible/ftest/templates/daos-launch.sh.j2
@@ -1,5 +1,5 @@
 #!/bin/bash
-# shellcheck disable=SC1000-SC9999
+# shellcheck disable=all
 # Jinja2 template can not be parsed with shellcheck
 
 set -o pipefail
diff --git a/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2 b/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2
index 485b4c9a1aa2..c4009a0f2a5e 100644
--- a/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2
+++ b/utils/ansible/ftest/templates/daos-launch_nlt.sh.j2
@@ -1,5 +1,5 @@
 #!/bin/bash
-# shellcheck disable=SC1000-SC9999
+# shellcheck disable=all
 # Jinja2 template can not be parsed with shellcheck
 
 set -o pipefail
diff --git a/utils/ansible/ftest/templates/daos-make.sh.j2 b/utils/ansible/ftest/templates/daos-make.sh.j2
index ee4e9dec7de9..4ad23399ed52 100644
--- a/utils/ansible/ftest/templates/daos-make.sh.j2
+++ b/utils/ansible/ftest/templates/daos-make.sh.j2
@@ -1,5 +1,5 @@
 #!/bin/bash
-# shellcheck disable=SC1000-SC9999
+# shellcheck disable=all
 # Jinja2 template can not be parsed with shellcheck
 
 set -o pipefail
diff --git a/utils/ci/run_in_gha.sh b/utils/ci/run_in_gha.sh
index 6ec76e66e683..fda112b4c873 100755
--- a/utils/ci/run_in_gha.sh
+++ b/utils/ci/run_in_gha.sh
@@ -69,10 +69,8 @@ echo ::group::Config file after ALT_PREFIX build
 cat daos.conf
 echo ::endgroup::
 
-echo ::group::Install pydaos
-cd src/client
-python3 setup.py install
-cd -
+echo ::group::Install pydaos via pip
+pip install /opt/daos/lib/daos/python
 echo ::endgroup::
 
 echo ::group::Setting up daos_server_helper
diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml
index c58f11d5d74b..38385410ca27 100644
--- a/utils/config/daos_server.yml
+++ b/utils/config/daos_server.yml
@@ -202,16 +202,12 @@
 #
 ## Number of hugepages to allocate for DMA buffer memory
 #
-## Optional parameter that should only be set if overriding the automatically calculated value is
-## necessary.
-#
-## Specifies the number (not size) of hugepages to allocate for use by NVMe
-## through SPDK. Note that each target requires 1 GiB of hugepage space.
-## In DAOS version 2.2 and newer, nr_hugepages specifies the total across all
-## engines on a host. It needs to represent the total amount of hugepages memory
-## required for all targets across all engines on a host, divided by the system
-## hugepage size. If not set here, it will be automatically calculated based on
-## the number of targets (using the default system hugepage size).
+## Optional parameter that should only be set if overriding the automatically calculated value is
+## necessary. Specifies the number (not size) of hugepages to allocate for use by NVMe through
+## SPDK. For optimum performance each target requires 1 GiB of hugepage space. The provided value
+## should be calculated by dividing the total amount of hugepages memory required for all targets
+## across all engines on a host by the system hugepage size. If not set here, the value will be
+## automatically calculated based on the number of targets (using the default system hugepage size).
 #
 ## Example: (2 engines * (16 targets/engine * 1GiB)) / 2MiB hugepage size = 16384
 #
@@ -219,8 +215,7 @@
 #nr_hugepages: 0
 #
 ## Hugepages are mandatory with NVME SSDs configured and optional without.
-## To disable the use of hugepages when no NVMe SSDs are configured, set
-## disable_hugepages to true.
+## To disable the use of hugepages when no NVMe SSDs are configured, set disable_hugepages to true.
 #
 ## default: false
 #disable_hugepages: false
@@ -448,6 +443,15 @@
 #      - meta
 #      - wal
 #
+#    # Set criteria for automatic detection and eviction of faulty NVMe devices. The
+#    # default criteria parameters are `enable: true`, `max_io_errs: 10` and
+#    # `max_csum_errs: ` (essentially eviction due to checksum errors is
+#    # disabled by default).
+#    bdev_auto_faulty:
+#      enable: true
+#      max_io_errs: 100
+#      max_csum_errs: 200
+#
 #
 #-
 #  # Number of I/O service threads (and network endpoints) per engine.
@@ -596,3 +600,11 @@
 #      #
 #      # See about bdev_roles above.
 #      bdev_roles: [wal, meta, data]
+#
+#    # Disable automatic detection and eviction of faulty NVMe devices. The default
+#    # criteria parameters are `enable: true`, `max_io_errs: 10` and
+#    # `max_csum_errs: ` (essentially eviction due to checksum errors is
+#    # disabled by default).
+#
+#    bdev_auto_faulty:
+#      enable: false
diff --git a/utils/cq/words.dict b/utils/cq/words.dict
index d6a5f6f80a6b..008286978a76 100644
--- a/utils/cq/words.dict
+++ b/utils/cq/words.dict
@@ -216,6 +216,7 @@ iface
 indata
 infiniband
 init
+inode
 interoperability
 io
 iod
diff --git a/utils/docker/examples/.env b/utils/docker/examples/.env
new file mode 100644
index 000000000000..8ac0c2fd4aa9
--- /dev/null
+++ b/utils/docker/examples/.env
@@ -0,0 +1,30 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# Configuration file for DAOS docker compose
+
+# XXX Arguments which must be defined XXX
+# DAOS_CLIENT_UNAME="foo"
+# DAOS_CLIENT_UID="1001"
+# DAOS_CLIENT_GNAME="bar"
+# DAOS_CLIENT_GID="1001"
+
+# DAOS client runtime variables
+DAOS_AGENT_RUNTIME_DIR="/var/run/daos_agent"
+
+# DAOS Authentication runtime variables
+DAOS_SERVER_CERTS_TXZ="secrets/daos_server-certs.txz"
+DAOS_ADMIN_CERTS_TXZ="secrets/daos_admin-certs.txz"
+DAOS_AGENT_CERTS_TXZ="secrets/daos_agent-certs.txz"
+
+# Optional Arguments which should not be updated
+BUST_CACHE=""
+LINUX_DISTRO="el8"
+LINUX_IMAGE_NAME="rockylinux/rockylinux"
+LINUX_IMAGE_TAG="8.8"
+DAOS_REPOS="https://packages.daos.io/v2.4/EL8/packages/x86_64/"
+DAOS_GPG_KEYS="https://packages.daos.io/v2.4.0/RPM-GPG-KEY-2023"
+DAOS_REPOS_NOAUTH=""
+DAOS_VERSION="2.4.0-2.el8"
+DAOS_DOCKER_IMAGE_NSP="daos"
+DAOS_DOCKER_IMAGE_TAG="v2.4.0"
diff --git a/utils/docker/examples/README.md b/utils/docker/examples/README.md
new file mode 100644
index 000000000000..667b0d2461fb
--- /dev/null
+++ b/utils/docker/examples/README.md
@@ -0,0 +1,548 @@
+# DAOS in Docker
+
+This document describes different ways to build and deploy Docker images for running applications
+using a DAOS storage system. It also presents how to containerize a DAOS server and the SPDK setup
+third-party tool.
+
+
+## Prerequisites
+
+To build and deploy the Docker images, the `docker` CLI must be available.
+The docker host should have access to the [Docker Hub](https://hub.docker.com/) and
+[Rocky Linux](https://rockylinux.org/) official repositories.
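+
+Before proceeding, it can be worth confirming that these prerequisites are met. As a minimal
+sketch, the following standard Docker commands print the versions of the installed tooling:
+```bash
+# Confirm that the Docker CLI and the compose plugin are available
+docker --version
+docker compose version
+```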
+
+The platform was tested and validated with the following dependencies:
+- [Docker CE](https://docs.docker.com/engine/install/centos/) latest
+  [RPMs](https://download.docker.com/linux/centos/docker-ce.repo)
+- [DAOS 2.4](https://docs.daos.io/v2.4/) official [RPMs](https://packages.daos.io/v2.4/)
+- [rockylinux/rockylinux:8.8](https://hub.docker.com/r/rockylinux/rockylinux/) official docker
+  images.
+
+
+## Building DAOS Cloud Base Docker Image
+
+This section describes how to build the base Docker image used for building the DAOS docker images
+of the following sections. The easiest way is to use the `docker compose` subcommand. The first
+step is to update the docker environment file "utils/docker/examples/.env" according to the
+targeted DAOS system. The following environment variables must be defined in order to build the
+image properly:
+- `DAOS_CLIENT_UNAME`: User name of the client (e.g., "foo")
+- `DAOS_CLIENT_UID`: User id of the client (e.g., "1001")
+- `DAOS_CLIENT_GNAME`: Group name of the client (e.g., "bar")
+- `DAOS_CLIENT_GID`: Group id of the client (e.g., "1001")
+
+The following environment variables can be used to customize the Docker image to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the base DAOS docker image (default "v2.4.0")
+- `BUST_CACHE`: Manage the docker build cache (default ""). To invalidate the cache, a random
+  value such as the current date shall be given.
+- `LINUX_IMAGE_NAME`: Base docker image name to use (default "rockylinux/rockylinux")
+- `LINUX_IMAGE_TAG`: Tag identifier of the base docker image to use (default "8.8")
+- `DAOS_REPOS`: Space-separated list of repos needed to install DAOS (default
+  "https://packages.daos.io/v2.4/EL8/packages/x86\_64/")
+- `DAOS_GPG_KEYS`: Space-separated list of GPG keys associated with DAOS repos (default
+  "https://packages.daos.io/v2.4.0/RPM-GPG-KEY-2023")
+- `DAOS_REPOS_NOAUTH`: Space-separated list of repos to use without GPG authentication
+  (default "")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build the base
+DAOS docker image.
+```bash
+docker compose --file utils/docker/examples/docker-compose.base.yml build daos_base
+```
+
+!!! note
+    If the node running the docker host is using a name service such as NIS or LDAP, it may be
+    preferable to export this service inside the docker container.
+
+
+## DAOS SPDK Setup Containerization
+
+This section presents how to build and run a Docker image used to prepare the NVMe devices. This
+docker image is mainly intended to detach the NVMe devices from the kernel so that the SPDK
+library can manage them.
+
+### Docker Host Prerequisites
+
+According to the targeted DAOS server configuration, the Docker host should meet the same
+[hardware requirements](https://docs.daos.io/v2.4/admin/hardware) and
+[system setup](https://docs.daos.io/v2.4/admin/predeployment_check/) as if the DAOS server were
+running on bare metal.
+
+The [hugepages](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt) Linux kernel feature
+must also be enabled on the docker host. At least 4096 pages of 2048 KiB should be available. The
+number of huge pages allocated can be checked with the following command.
+```bash
+sysctl vm.nr_hugepages
+```
+
+The default size of a huge page, the number of available huge pages, etc. can be found with the
+following command.
+```bash
+cat /proc/meminfo | grep -e "^Huge"
+```
+
+!!! warning
+    Currently, the [VFIO](https://docs.kernel.org/driver-api/vfio.html) driver is not supported
+    when the DAOS server is containerized. This driver shall therefore be
+    [deactivated](https://docs.daos.io/v2.4/admin/predeployment_check/#enable-iommu) to let
+    [SPDK](https://spdk.io/) use the
+    [UIO](https://www.kernel.org/doc/html/v4.12/driver-api/uio-howto.html) driver.
+
+### Building Docker Image
+
+This section describes how to build a Docker image wrapping the SPDK setup script. Firstly, update
+the docker environment file "utils/docker/examples/.env" to customize the Docker image to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS base docker image (default "v2.4.0")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build the SPDK
+setup docker image.
+```bash
+docker compose --file utils/docker/examples/docker-compose.spdk_setup.yml build daos_spdk_setup
+```
+
+### Running Docker Image
+
+To check the status of the NVMe devices, run the following command.
+```bash
+docker compose --file utils/docker/examples/docker-compose.spdk_setup.yml run --rm daos_spdk_setup status
+```
+
+The following output indicates that the NVMe devices are not usable with SPDK because they are
+managed by the kernel.
+```
+Type     BDF             Vendor Device NUMA Driver           Device     Block devices
+NVMe     0000:83:00.0    8086   0953   1    nvme             nvme1      nvme1n1
+NVMe     0000:84:00.0    8086   0953   1    nvme             nvme0      nvme0n1
+```
+
+Run the following command to detach them from the kernel and let SPDK manage them.
+```bash
+docker compose --file utils/docker/examples/docker-compose.spdk_setup.yml run --rm daos_spdk_setup config
+```
+
+After running this command, running the status sub-command should produce the following output.
+```
+Type     BDF             Vendor Device NUMA Driver           Device     Block devices
+NVMe     0000:83:00.0    8086   0953   1    uio_pci_generic  -          -
+NVMe     0000:84:00.0    8086   0953   1    uio_pci_generic  -          -
+```
+
+## DAOS Server Containerization
+
+This section presents how to build and deploy a Docker image running a DAOS server.
+
+### Building Docker Image
+
+This section describes how to build a Docker image of a DAOS server. The first step is to create
+the "daos\_server.yml" configuration file and to place it in the directory
+"utils/docker/examples/daos-server/el8". An example of such a configuration file is available in
+this directory. Defining the content of this configuration file is out of the scope of this
+documentation. Please refer to the section "Create Configuration Files" of the
+docs/QSG/setup\_rhel.md or docs/QSG/setup\_suse.md for detailed instructions.
+
+!!! warning
+    The 'disable_vfio' yaml property of the "daos\_server.yml" configuration file shall be set to
+    use the UIO driver instead of the VFIO one (which is not yet supported with docker).
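+
+As an illustration, the following minimal excerpt of a "daos\_server.yml" file shows this property
+set accordingly (all other properties are omitted here and shall be taken from the example
+configuration file mentioned above):
+```yaml
+disable_vfio: true
+```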
+
+Then, update the docker environment file "utils/docker/examples/.env" to customize the
+Docker image to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS base docker image (default "v2.4.0")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build the DAOS
+Server docker image.
+```bash
+docker compose --file utils/docker/examples/docker-compose.server.yml build daos_server
+```
+
+### Running Docker Image
+
+This section presents how to run the image of a containerized DAOS server using docker compose.
+First, a compressed tarball (i.e. `tar` archive compressed with `xz`) of the DAOS certificate
+files needs to be created when DAOS authentication is enabled. Creating this tarball is out of the
+scope of this documentation. Please refer to the section "Generate certificates" of the
+docs/QSG/setup\_rhel.md or docs/QSG/setup\_suse.md for detailed instructions.
+
+When using Docker Compose, the certificates tarball must be readable by all users and its file
+path defined in the following variable of the docker environment file
+"utils/docker/examples/.env":
+- `DAOS_SERVER_CERTS_TXZ`: tarball containing the DAOS certificates needed by the DAOS server
+  (e.g., "secrets/daos\_server-certs.txz").
+
+This tarball has to contain at least the following files:
+```
+tar tvJf secrets/daos_server-certs.txz
+-rw-r--r-- ckochhof/ckochhof 1436 2023-09-15 14:45 daosCA.crt
+-rw-r--r-- ckochhof/ckochhof 5287 2023-09-15 14:45 server.crt
+-r-------- ckochhof/ckochhof 2459 2023-09-15 14:45 server.key
+-rw-r--r-- ckochhof/ckochhof 5238 2023-09-15 14:45 agent.crt
+```
+
+!!! note
+    For properly managing secrets, Docker Stack should be used instead of Docker Compose. Sadly,
+    several docker compose configuration options needed for running a containerized DAOS server,
+    such as [privileged](https://github.com/moby/swarmkit/pull/3072), are not yet supported in
+    swarm mode.
+
+When the tarball has been created and the environment file properly filled, run the following
+command to start a DAOS server.
+```bash
+docker compose --file utils/docker/examples/docker-compose.server.yml run --rm daos_server
+```
+
+
+## DAOS Admin Containerization
+
+This section presents how to build and run a Docker image for administrating a DAOS file
+system.
+
+### Building Docker Image
+
+This section describes how to build a Docker image allowing a DAOS filesystem to be administrated
+through the DAOS Management Tool (i.e., dmg) CLI. The first step is to create the
+"daos\_control.yml" configuration file and to place it in the directory
+"utils/docker/examples/daos-admin/el8". An example of such a configuration file is available in
+this directory. Defining the content of this configuration file is out of the scope of this
+documentation. Please refer to the section "Create Configuration Files" of the
+docs/QSG/setup\_rhel.md or docs/QSG/setup\_suse.md for detailed instructions.
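+
+For illustration, a minimal "daos\_control.yml" with certificate-based authentication could look
+as follows (this sketch mirrors the "daos\_control.yml.example" file shipped in this directory,
+with "localhost" as a placeholder hostlist):
+```yaml
+name: daos_server
+hostlist: ['localhost']
+port: 10001
+
+transport_config:
+  allow_insecure: false
+  ca_cert: /etc/daos/certs/daosCA.crt
+  cert: /etc/daos/certs/admin.crt
+  key: /etc/daos/certs/admin.key
+```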
+
+Then, update the following environment variables of the docker environment file
+"utils/docker/examples/.env" to customize the Docker image to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS base docker image (default "v2.4.0")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build the DAOS
+admin docker image.
+```bash
+docker compose --file utils/docker/examples/docker-compose.admin.yml build daos_admin
+```
+
+### Running Docker Image
+
+This section presents two different ways of running a DAOS admin container. For both methods,
+a compressed tarball (i.e. `tar` archive compressed with `xz`) of the DAOS certificate files
+should be created when DAOS authentication is enabled.
+However, the tarball is not managed in the same way by the two solutions.
+
+This tarball has to contain at least the following files:
+```
+tar tvJf secrets/daos_admin-certs.txz
+-rw-r--r-- ckochhof/ckochhof 1436 2023-09-15 14:45 daosCA.crt
+-rw-r--r-- ckochhof/ckochhof 5238 2023-09-15 14:45 admin.crt
+-r-------- ckochhof/ckochhof 2459 2023-09-15 14:45 admin.key
+```
+
+#### Running with Docker Compose
+
+When using Docker Compose, the certificates tarball should be readable by all users and its file
+path defined in the following variable of the docker environment file
+"utils/docker/examples/.env":
+- `DAOS_ADMIN_CERTS_TXZ`: tarball containing the DAOS certificates needed by the dmg CLI (e.g.,
+  "secrets/daos\_admin-certs.txz")
+
+When the environment file has been properly filled, run the following commands to format the DAOS
+file system and to create a DAOS pool using all the available storage.
+```bash
+docker compose --file utils/docker/examples/docker-compose.admin.yml run --rm daos_admin
+dmg storage format
+dmg system query --verbose
+dmg pool create --size=100% --user= --group=
+dmg pool query
+```
+
+#### Running with Docker Stack
+
+With Docker Stack, the certificates tarball is managed as a [Docker
+Secret](https://docs.docker.com/engine/swarm/secrets/). Docker Secret is a swarm service allowing
+blobs of data to be securely stored and accessed. To record a tarball containing the DAOS admin
+certificates, execute the following commands.
+```bash
+docker swarm init
+docker secret create daos_admin-certs
+```
+
+As soon as the Docker secret has been created, run the following commands to format the DAOS
+filesystem and to create a DAOS pool using all the available storage.
+```bash
+bash utils/docker/examples/deploy-docker_stack.sh utils/docker/examples/docker-stack.admin.yml
+docker exec -ti bash
+dmg storage format
+dmg system query --verbose
+dmg pool create --size=100% --user= --group=
+dmg pool query
+```
+
+
+## DAOS Client Containerized with Bare Metal DAOS Agent
+
+With the deployment solution presented in this section, the DAOS client is running in a docker
+container and the DAOS Agent is running on the docker host node.
+
+The first step is to install and configure the `daos_agent` service on the docker host.
+Installation and deployment of this service is out of the scope of this documentation.
+Please refer to docs/QSG/setup\_rhel.md or docs/QSG/setup\_suse.md for detailed instructions.
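+
+As a sketch, on an EL8 docker host where a DAOS packages repository is already configured, the
+installation and startup of this service could look as follows (package and service names are
+assumed to be the ones shipped with the official DAOS RPMs):
+```bash
+sudo dnf install daos-client
+sudo systemctl enable --now daos_agent
+```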
+
+### Building Docker Image
+
+This section describes how to build a Docker image for accessing a DAOS file system through
+a DAOS agent running on the docker host. Firstly, update the docker environment file
+"utils/docker/examples/.env" to customize the Docker image to build:
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "v2.4.0")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+Run the following command to create a DAOS client docker image using a bare metal DAOS agent.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_bm.yml build daos_client_bm
+```
+
+!!! note
+    The certificates of the DAOS agent running on the docker host do not need to be copied or
+    shared into the Docker image.
+
+### Running Docker Image
+
+This section presents how to run some relevant use cases with a docker image built according to
+the previous section. Firstly, define the following environment variables of the docker
+environment file "utils/docker/examples/.env":
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_CLIENT_UID`: User id of the client (e.g., "1001")
+- `DAOS_CLIENT_GID`: Group id of the client (e.g., "1001")
+- `DAOS_AGENT_RUNTIME_DIR`: Directory containing the DAOS agent socket (e.g., "/var/run/daos\_agent")
+
+When the environment file has been properly filled, execute the following commands to run an
+autotest of the DAOS pool created in the section [Running DAOS Admin Docker Image](#anchor-001).
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_bm.yml run --rm daos_client_bm
+daos pool autotest
+```
+
+With the same prerequisites, execute the following commands to run a
+[fio](https://fio.readthedocs.io/) file system benchmark.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_bm.yml run --rm daos_client_bm
+mkdir -p "/home//mnt"
+daos container create --type=posix posix-fs
+dfuse "/home//mnt" posix-fs
+df --human-readable --type=fuse.daos
+fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt"
+```
+
+### Configuring Bare Metal DAOS Agent
+
+When a Docker Engine service is installed on a node, it creates a virtual interface `docker0`. This
+virtual interface can be mistakenly selected by the DAOS agent. To overcome this issue, update the
+`fabric_ifaces` section of the "daos\_agent.yml" configuration file. The following example shows
+how to force the daos\_agent to use the eth0 network interface device.
+```yaml
+fabric_ifaces:
+- numa_node: 0
+  devices:
+  - iface: eth0
+    domain: eth0
+```
+
+
+## DAOS Client and Agent Containerized
+
+With the deployment solution presented in this section, the DAOS client and the DAOS Agent are
+running in two different docker containers.
+
+### Building DAOS Client Docker Image
+
+This image uses the same environment variables as the ones used for building the DAOS Client
+Docker image of the section [Building DAOS client Docker Image](#anchor-003).
+
+When the environment file has been properly filled, run the following command to create a DAOS
+client docker image using a containerized DAOS agent.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml build daos_client_sa
+```
+
+### Building DAOS Agent Docker Image
+
+This section describes how to build the Docker container running the DAOS agent service allowing
+the DAOS client container to access a DAOS file system. The first step is to create a
+"daos\_agent.yml" configuration file and to place it in the directory
+"utils/docker/examples/daos-agent/el8".
+Defining the content of this configuration file is out of the scope of this documentation. Please
+refer to the section "Create Configuration Files" of the docs/QSG/setup\_rhel.md or
+docs/QSG/setup\_suse.md for detailed instructions.
+
+!!! warning
+    As for the bare metal DAOS agent, the `fabric_ifaces` section of the "daos\_agent.yml"
+    configuration file should be defined.
+
+Then, update the docker environment file "utils/docker/examples/.env" to customize the
+Docker image to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the base DAOS docker image (default "v2.4.0")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build the DAOS
+Agent docker image.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml build daos_agent
+```
+
+### Running Docker Images
+
+This section presents how to run some relevant use cases with a Docker image built according to
+the previous sections. First, define the following environment variables of the docker
+environment file "utils/docker/examples/.env":
+- `DAOS_CLIENT_UID`: User id of the client (e.g., "1001")
+- `DAOS_CLIENT_GID`: Group id of the client (e.g., "1001")
+
+Then, a compressed tarball (i.e. `tar` archive compressed with `xz`) of the DAOS
+certificate files needs to be created when DAOS authentication is enabled. When using Docker
+Compose, the certificates tarball should be readable by all users and its file path
+defined in the following variable of the docker environment file "utils/docker/examples/.env":
+- `DAOS_AGENT_CERTS_TXZ`: tarball containing the DAOS certificates needed by the DAOS agent
+  (e.g., "secrets/daos\_agent-certs.txz")
+
+This tarball has to contain at least the following files:
+```
+tar tvJf secrets/daos_agent-certs.txz
+-rw-r--r-- ckochhof/ckochhof 1436 2023-09-15 14:45 daosCA.crt
+-rw-r--r-- ckochhof/ckochhof 5238 2023-09-15 14:45 agent.crt
+-r-------- ckochhof/ckochhof 2455 2023-09-15 14:45 agent.key
+```
+
+!!! note
+    As [Docker Secret](https://docs.docker.com/engine/swarm/secrets/) is a Docker Swarm service,
+    it cannot be used properly with Docker Compose. With Docker Compose, secrets are managed as
+    standard Docker Volumes mounted in the `/run/secrets` directory. More details can be found at:
+    https://github.com/docker/compose/issues/9139#issuecomment-1098137264
+
+!!! note
+    For properly managing secrets, Docker Stack should be used instead of Docker Compose. Sadly,
+    the `pid=host` option is not yet supported in swarm mode, and this option is mandatory to
+    allow the DAOS Agent to monitor its associated clients.
More details can be found at:
+    https://github.com/docker/docs/issues/5624 and https://github.com/moby/swarmkit/issues/1605
+
+When the environment file has been properly filled, execute the following commands to run an
+autotest of the DAOS pool created in the section [Running DAOS Admin Docker
+Image](#anchor-001).
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml up --detach daos_agent
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml run --rm daos_client_sa
+daos pool autotest
+```
+
+With the same prerequisites, execute the following commands to run a
+[fio](https://fio.readthedocs.io/) file system benchmark.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml up --detach daos_agent
+docker compose --file utils/docker/examples/docker-compose.client_sa.yml run --rm daos_client_sa
+mkdir -p "/home//mnt"
+daos container create --type=posix posix-fs
+dfuse "/home//mnt" posix-fs
+df --human-readable --type=fuse.daos
+fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt"
+```
+
+
+## DAOS Client and Agent Gathered
+
+With the deployment solution presented in this section, the DAOS client and the DAOS Agent are
+running in the same container.
+
+### Building Docker Images
+
+This section describes how to build the `daos-client` docker image.
+
+The easiest way to build this image is to use the `docker compose` sub-command. The first step is
+to build the `daos_agent` image. The procedure for building this image is the same as the one
+described in the section [Building DAOS Agent Docker Image](#anchor-002).
+
+Then, define the following environment variables of the Docker environment file
+"utils/docker/examples/.env":
+- `DAOS_CLIENT_UNAME`: User name of the client (e.g., "foo")
+- `DAOS_CLIENT_GNAME`: Group name of the client (e.g., "bar")
+
+Finally, update the docker environment file "utils/docker/examples/.env" to customize the Docker
+images to build:
+- `LINUX_DISTRO`: Linux distribution identifier (default "el8")
+- `DAOS_DOCKER_IMAGE_NSP`: Namespace identifier of the base DAOS docker image (default "daos")
+- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "v2.4.0")
+- `DAOS_VERSION`: Version of DAOS to use (default "2.4.0-2.el8")
+
+When the environment file has been properly filled, run the following command to build a DAOS
+Client Docker image running its own DAOS Agent service.
+```bash
+docker compose --file utils/docker/examples/docker-compose.client_gt.yml build daos_client_gt
+```
+
+### Running Docker Images
+
+This section presents two different ways of running some relevant use cases with a docker image
+built according to the previous section. For both methods, a compressed tarball (i.e. `tar`
+archive compressed with `xz`) of the DAOS certificate files should be created when DAOS
+authentication is enabled. However, the tarball is not managed in the same way by the two
+solutions.
+
+#### Running with Docker Compose
+
+When using Docker Compose, the certificates tarball should be readable by all users and its
+file path defined in the following variable of the docker environment file
+"utils/docker/examples/.env":
+- `DAOS_AGENT_CERTS_TXZ`: tarball containing the DAOS certificates needed by the DAOS agent
+  (e.g.,
"secrets/daos\_agent-certs.txz") + +In a second time, define the following environment variables of the Docker environment file +"utils/docker/examples/.env": +- `DAOS_CLIENT_UID`: User id of the client (e.g. "1001") +- `DAOS_CLIENT_GID`: Group id of the client (e.g. "1001") + +When the environment file has been properly filled, execute the following commands to run an +autotest of the DAOS pool created in the section [Running DAOS Admin Docker Image](#anchor-001). +```bash +docker compose --file utils/docker/examples/docker-compose.client_gt.yml run --rm daos_client_gt +daos pool autotest +``` + +With the same prerequites, execute the following command to run a [fio](https://fio.readthedocs.io/) +file system benchmark. +```bash +docker compose --file utils/docker/examples/docker-compose.yml run --rm daos_client_gt +mkdir -p "/home//mnt" +daos container create --type=posix posix-fs +dfuse "/home//mnt" posix-fs +df --human-readable --type=fuse.daos +fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt" +``` + +#### Running with Docker Stack + +With Docker Stack the tarball of the certificates are managed as [Docker +Secret](https://docs.docker.com/engine/swarm/secrets/). Docker Secret is a swarm service allowing +to securely store and access blob of data. For recording a tarball containing the DAOS agent +certificates, execute following commands. +```bash +docker swarm init +docker secret create daos_agent-certs +``` + +As soon as the Docker secret has been created, execute the following commands to run an +autotest of the DAOS pool created in the section [Running DAOS Admin Docker Image](#anchor-001). +```bash +bash utils/docker/examples/deploy-docker_stack.sh utils/docker/examples/docker-stack.client_gt.yml +docker exec -u${DAOS_CLIENT_UID}:${DAOS_CLIENT_GID} -ti bash +daos pool autotest +``` + +!!! note + At this time, it is not possible to use + [DFuse](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) inside a stack + deployed in swarm mode. Indeed, the docker option + [devices](https://docs.docker.com/compose/compose-file/compose-file-v3/#devices) is not + supported, and thus it is not possible to export the "/dev/fuse" device needed by DFuse. diff --git a/utils/docker/examples/client/.env b/utils/docker/examples/client/.env deleted file mode 100644 index cd64391ce484..000000000000 --- a/utils/docker/examples/client/.env +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2021-2022 Intel Corporation -# All rights reserved. 
-# -# Configuration file for DAOS docker compose - -# XXX Arguments which must be defined XXX -# DAOS_CLIENT_UNAME="foo" -# DAOS_CLIENT_UID="666" -# DAOS_CLIENT_GNAME="bar" -# DAOS_CLIENT_GID="999" -# DAOS_ACCESS_POINTS="['hostname1', 'hostname2']" - -# Optional arguments -DAOS_PORT="10001" -DAOS_IFACE_CFG="yes" -DAOS_IFACE_NUMA_NODE="0" -DAOS_IFACE_NAME="eth0" -DAOS_IFACE_DOMAIN_NAME="eth0" -DAOS_AUTH="yes" -DAOS_AGENT_RUNTIME_DIR="/var/run/daos_agent" -# XXX Arguments which must be defined with docker compose XXX -# DAOS_AGENT_CERTS_TXZ="secrets/daos_agent-certs.txz" - -# Arguments which should not be updated -DAOS_DOCKER_IMAGE_TAG="rocky8.6" -RHEL_BASE_IMAGE_NAME="rockylinux/rockylinux" -RHEL_BASE_IMAGE_TAG="8.6" -BUST_CACHE="" -DAOS_REPOS="https://packages.daos.io/v2.2/EL8/packages/x86_64/" -DAOS_GPG_KEYS="https://packages.daos.io/RPM-GPG-KEY" -DAOS_REPOS_NOAUTH="" -DAOS_VERSION="2.2.0-4.el8" diff --git a/utils/docker/examples/client/README.md b/utils/docker/examples/client/README.md deleted file mode 100644 index 547690528c27..000000000000 --- a/utils/docker/examples/client/README.md +++ /dev/null @@ -1,301 +0,0 @@ -# DAOS in Docker - -This document describes different ways to build and deploy base Docker images for running -application using a DAOS storage system. - - -## Prerequisites - -To build and deploy the Docker images, `docker` cli shall be available. -The docker host should have access to the [Docker Hub](https://hub.docker.com/) and -[Rocky Linux](https://rockylinux.org/) official repositories. - -The platform was tested and validated with the following dependencies: -- [Docker CE](https://docs.docker.com/engine/install/centos/) latest - [RPMs](https://download.docker.com/linux/centos/docker-ce.repo) -- [DAOS 2.2](https://docs.daos.io/v2.2/) official [RPMS](https://packages.daos.io/v2.2/) -- [rockylinux/rockylinux:8.6](https://hub.docker.com/r/rockylinux/rockylinux/) official docker - images. - - -## Building DAOS Cloud Base Docker Image - -This section describes how to build the base Docker image used for building the DAOS docker images -of the following sections. The easiest way is to use the `docker compose` sub command. The first -step is to update the docker environment file `utils/docker/examples/client/.env` according to the -targeted DAOS system. The following environment variables must be defined for being able to -properly build a docker image: -- `DAOS_CLIENT_UNAME`: User name of the client (e.g. "foo") -- `DAOS_CLIENT_UID`: User id of the client (e.g., "666") -- `DAOS_CLIENT_GNAME`: Group name of the client (e.g., "bar") -- `DAOS_CLIENT_GID`: Group id of the client (e.g., "999") - -!!! note - If the node running the docker host is using a name service such as NIS or LDAP, it could be - more adapted to export this service inside the docker container. - -The following environment variables allow to customize the Docker image to build: -- `BUST_CACHE`: Manage docker building cache (default ""). To invalidate the cache, a random value - such as the date of day shall be given. 
-- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "rocky8.6") -- `RHEL_BASE_IMAGE_NAME`: Base docker image name to use (default "rockylinux/rockylinux") -- `RHEL_BASE_IMAGE_TAG`: Tag identifier of the base docker image to use (default "8.6") -- `DAOS_REPOS`: Space separated list of repos needed to install DAOS (default - "https://packages.daos.io/v2.2/EL8/packages/x86\_64/") -- `DAOS_GPG_KEYS`: Space separated list of GPG keys associated with DAOS repos (default - "https://packages.daos.io/RPM-GPG-KEY") -- `DAOS_REPOS_NOAUTH`: Space separated list of repos to use without GPG authentication - (default "") -- `DAOS_VERSION`: Version of DAOS to use (default "2.2.0-4.el8") - -When the environment file has been properly filled, the docker image could be created thanks to the -following command: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_base.yml build -``` - - -## DAOS Client Containerized with Bare Metal DAOS Agent - -With the deployment solution presented in this section, the DAOS client is running in a docker -container and the DAOS Agent is running on the docker host node. - -### Building DAOS Client Docker Image - -This section describes how to build Docker container allowing to access a DAOS file system through -a DAOS agent running on the docker host. The easiest way is to use the `docker compose` sub -command. The first step is to update the docker environment file -`utils/docker/examples/client/.env` according to the targeted DAOS system. - -The following environment variables allow to customize the Docker image to build: -- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "rocky8.6") - -The docker image could be then created thanks to the following command: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client.standalone.yml build -``` - -!!! note - It is not needed to copy or share the certificates of the DAOS agent running on the docker host - in the Docker image. - -### Running DAOS Client Docker Image - -This section presents how to run some relevant use cases with a docker image build according to the -previous section. 
Firstly the following environment variables of the docker environment file -`utils/docker/examples/client/.env` must be defined: -- `DAOS_CLIENT_UID`: User id of the client (e.g., "666") -- `DAOS_CLIENT_GID`: Group id of the client (e.g., "999") - -It could also be needed to define the following environment variable according to the configuration -of DAOS agent running on the docker host: -- `DAOS_AGENT_RUNTIME_DIR`: Directory containing the DAOS agent socket (default `/var/run/daos_agent`) - -When the environment file has been properly filled, the `daos pool autotest` could be run thanks to -the following commands: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client.standalone.yml run --rm daos_client -$ daos pool autotest -``` - -With the same prerequites, the [fio](https://fio.readthedocs.io/) file system benchmark tool could -be run thanks to the following commands: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client.standalone.yml run --rm daos_client -$ mkdir -p "/home//mnt" -$ daos container create --type=posix --label=posix-fs tank -$ dfuse --mountpoint="/home//mnt" --pool=tank --container=posix-fs -$ df --human-readable --type=fuse.daos -$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt" -``` - -### Docker Host Configuration - -When a Docker Engine service is installed on a node it creates a virtual interface `docker0` which -could be misused by the DAOS agent. To overcome this issue, the `fabric_ifaces` section of the -`daos_agent.yml` configuration file could be used, as illustrated on the following example: -```yaml -fabric_ifaces: -- numa_node: 0 - devices: - - iface: eth0 - domain: eth0 -``` - - -## DAOS Client and Agent Containerized - -With the deployment solution presented in this section, the DAOS client and the DAOS Agent are -running in two different docker containers. - -### Building DAOS Client Docker Image - -This image is using the same environment variables as the DAOS Client Docker image of the previous -section. - -When the environment file has been properly filled, the docker image could be created thanks to the -following command: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml build daos_client -``` - -### Building DAOS Agent Docker Image - -This section describes how to build the Docker container running the DAOS agent service allowing the -DAOS client container to access a DAOS file system. The following environment variables must be -defined for being able to properly build the docker image: -- `DAOS_ACCESS_POINTS`: List of DAOS management server access points (e.g. "['hostname1', - 'hostname2']") -- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "rocky8.6") - -The following environment variables allow to customize the Docker images to build: -- `DAOS_PORT`: DAOS access point port number to connect (default "1001") -- `DAOS_AUTH`: Enable DAOS authentication when set to "yes" (default "yes") -- `DAOS_IFACE_CFG`: Enable manual configuration of the interface to use by the agent (default "yes") -- `DAOS_IFACE_NUMA_NODE`: Numa node of the interface to use by the agent (default "0"). Defining - this variable is mandatory when `DAOS_IFACE_CFG` is enabled. 
-- `DAOS_IFACE_NAME`: Name of the interface to use by the agent (default ""). Defining this variable - is mandatory when `DAOS_IFACE_CFG` is enabled. -- `DAOS_IFACE_DOMAIN_NAME`: Domain name of the interface to use by the agent (default "eth0"). - Defining this variable is mandatory when `DAOS_IFACE_CFG` is enabled. - -!!! warning - On most of the system the`DAOS_IFACE_CFG` should be enabled to avoid the DAOS agent service of - using an invalid network interface such as the `docker0` virtual network interface. - -When the environment file has been properly filled, the docker image could be created thanks to the -following command: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml build daos_agent -``` - -### Running DAOS Docker Images - -This section presents how to run some relevant use cases with a docker image build according to the -previous section. In a first time the following environment variables of the docker environment -file `utils/docker/examples/client/.env` must be defined: -- `DAOS_CLIENT_UID`: User id of the client (e.g., "666") -- `DAOS_CLIENT_GID`: Group id of the client (e.g., "999") - -In a second time, a tarball (i.e. `tar` archive compressed with `xz`) of the DAOS certificate files -should be created when the DAOS authentication is enabled. For using Docker Compose the tarball of -the certificates file path readable by all users and its file path defined in the following variable -of the docker environment file `utils/docker/examples/client/.env`: -- `DAOS_AGENT_CERTS_TXZ`: tarball containing the DAOS certificated needed by the DAOS agent - (e.g. "secrets/daos\_agent-certs.txz") - -!!! note - As [Docker Secret](https://docs.docker.com/engine/swarm/secrets/) is a Docker Swarm service, it - could not be used properly with Docker Compose. With Docker Compose, secret are managed as - standard Docker Volume mounted in `/run/secrets` directory. More details could be found at: - https://github.com/docker/compose/issues/9139#issuecomment-1098137264 - -!!! note - For properly managing secret, Docker Stack should be used instead of Docker Compose. Sadly, the - `pid=host` option is not yet supported in swarm mode, and this last one is mandatory to allow - the DAOS Agent to monitor its associated clients. 
More details could be found at: - https://github.com/docker/docs/issues/5624 and https://github.com/moby/swarmkit/issues/1605 - -When the environment file has been properly filled, then an application such as `daos pool autotest` -could be run in the following way: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml up --detach daos_agent -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml run --rm daos_client -$ daos pool autotest -``` - -With the same prerequites, the [fio](https://fio.readthedocs.io/) file system benchmark tool could -be run thanks to the following commands: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml up --detach daos_agent -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml run --rm daos_client -$ mkdir -p "/home//mnt" -$ daos container create --type=posix --label=posix-fs tank -$ dfuse --mountpoint="/home//mnt" --pool=tank --container=posix-fs -$ df --human-readable --type=fuse.daos -$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt" -``` - - -## DAOS Client and Agent Gathered - -With the deployment solution presented in this section, the DAOS client and the DAOS Agent are -running in the same container. - -### Building DAOS Client Docker Image - -This section describes how to build the `daos-client_agent` docker image. - -The easiest way to build this image is to use the `docker compose` sub command. The first step is -to update the docker environment file `utils/docker/examples/client/.env` according to the targeted -DAOS system. The following environment variables must be defined for being able to properly build -the docker image: -- `DAOS_DOCKER_IMAGE_TAG`: Tag identifier of the DAOS client docker image (default "rocky8.6") -- `DAOS_CLIENT_UNAME`: User name of the client (e.g. "foo") -- `DAOS_CLIENT_GNAME`: Group name of the client (e.g., "bar") - -When the environment file has been properly filled, the docker image could be created thanks to the -following commands: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml build daos_client_agent -``` - -### Running DAOS Docker Containers - -This section presents two different way for running some relevant use cases with a docker image -build according to the previous section. For both methods, a tarball (i.e. `tar` archive compressed -with `xz`) of the DAOS certificate files should be created when the DAOS authentication is enabled. -However, it is not managed in the same way with both solutions. - -#### Running DAOS Docker Images with Docker Compose - -For using Docker Compose the tarball of the certificates file path readable by all users and its -file path defined in the following variable of the docker environment file -`utils/docker/examples/client/.env`: -- `DAOS_AGENT_CERTS_TXZ`: tarball containing the DAOS certificated needed by the DAOS agent - (e.g. 
"secrets/daos\_agent-certs.txz") -- `DAOS_CLIENT_UID`: User id of the client (e.g., "666") -- `DAOS_CLIENT_GID`: Group id of the client (e.g., "999") - -When the environment file has been properly filled, then an application such as `daos pool autotest` -could be run in the following way: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml run --rm daos_client_agent -$ daos pool autotest -``` - -With the same prerequites as for the previous section, the [fio](https://fio.readthedocs.io/) file -system benchmark tool could be run thanks to the following commands: -```bash -docker compose --file utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml run --rm daos_client_agent -$ mkdir -p "/home//mnt" -$ daos container create --type=posix --label=posix-fs tank -$ dfuse --mountpoint="/home//mnt" --pool=tank --container=posix-fs -$ df --human-readable --type=fuse.daos -$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting --directory="/home//mnt" -``` - -#### Running DAOS Docker Images with Docker Stack - -With Docker Stack the tarball of the certificates are managed as [Docker -Secret](https://docs.docker.com/engine/swarm/secrets/). Docker Secret is a swarm service allowing -to securely store and access blob of data. Recording a tarball containing the DAOS agent -certificates could be done in the following way: -```bash -docker swarm init -docker secret create daos_agent-certs -``` - -As soon as the Docker secret has been created, an application such as `daos pool autotest` -could be run in the following way: -```bash -bash utils/docker/examples/client/deploy-docker_stack.sh utils/docker/examples/client/docker-stack.daos_client_agent.gathered.yml -docker exec -u${DAOS_CLIENT_UID}:${DAOS_CLIENT_GID} -ti bash -$ daos pool autotest -``` - -At this time, it is not possible to use -[DFuse](https://docs.daos.io/v2.2/user/filesystem/?h=dfuse#dfuse-daos-fuse) inside a stack deployed -in swarm mode. Indeed, the docker option -[devices](https://docs.docker.com/compose/compose-file/compose-file-v3/#devices) is not supported, -and thus it is not possible to export the `dev/fuse` device needed by DFuse. diff --git a/utils/docker/examples/client/daos-agent/el8/Dockerfile b/utils/docker/examples/client/daos-agent/el8/Dockerfile deleted file mode 100644 index 933299333f98..000000000000 --- a/utils/docker/examples/client/daos-agent/el8/Dockerfile +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2021-2023 Intel Corporation -# All rights reserved. 
-# -# 'recipe' for building a base RHEL DAOS client docker image -# -# This Dockerfile accept the following input build arguments: -# - DAOS_DOCKER_IMAGE_TAG Tag identifier of the DAOS client docker image (default "rocky8.6") -# - DAOS_ACCESS_POINTS Hostname list of the DAOS access points (mandatory) -# - DAOS_PORT DAOS access point port number to connect (default "1001") -# - DAOS_IFACE_CFG Enable manual configuration of the interface to use by the agent -# (default "yes") -# - DAOS_IFACE_NUMA_NODE Numa node of the interface to use by the agent (default "0") -# - DAOS_IFACE_NAME Name of the interface to use by the agent (default "eth0") -# - DAOS_IFACE_DOMAIN_NAME Domain name of the interface to use by the agent (default "eth0") -# - DAOS_AUTH Enable DAOS authentication when set to "yes" (default "yes") - -# Pull base image -ARG DAOS_DOCKER_IMAGE_TAG=rocky8.6 -FROM daos-base:$DAOS_DOCKER_IMAGE_TAG -LABEL maintainer="daos@daos.groups.io" - -# Install DAOS agent configuration file -ARG DAOS_ACCESS_POINTS="" -ARG DAOS_PORT="10001" -ARG DAOS_AUTH="yes" -ARG DAOS_IFACE_CFG="yes" -ARG DAOS_IFACE_NUMA_NODE="0" -ARG DAOS_IFACE_NAME="eth0" -ARG DAOS_IFACE_DOMAIN_NAME="eth0" -COPY daos_agent.yml.in /tmp/daos_agent.yml.in -RUN if [[ -z "${DAOS_ACCESS_POINTS}" ]] ; then \ - echo "[ERROR] Docker build argument DAOS_ACCESS_POINTS is not defined" ; \ - exit 1 ; \ - fi ; \ - sed --in-place --regexp-extended \ - --expression "s/@DAOS_ACCESS_POINTS@/${DAOS_ACCESS_POINTS}/" \ - --expression "s/@DAOS_PORT@/${DAOS_PORT}/" \ - /tmp/daos_agent.yml.in && \ - if [[ "${DAOS_AUTH}" == yes ]] ; then \ - sed --in-place --regexp-extended \ - --expression '/^@DAOS_NOAUTH_SECTION_BEGIN@$/,/^@DAOS_NOAUTH_SECTION_END@/d' \ - --expression '/(^@DAOS_AUTH_SECTION_BEGIN@$)|(^@DAOS_AUTH_SECTION_END@$)/d' \ - /tmp/daos_agent.yml.in ; \ - else \ - sed --in-place --regexp-extended \ - --expression '/^@DAOS_AUTH_SECTION_BEGIN@$/,/^@DAOS_AUTH_SECTION_END@/d' \ - --expression '/(^@DAOS_NOAUTH_SECTION_BEGIN@$)|(^@DAOS_NOAUTH_SECTION_END@$)/d'\ - /tmp/daos_agent.yml.in ; \ - fi && \ - if [[ "${DAOS_IFACE_CFG}" == yes ]] ; then \ - for it in DAOS_IFACE_NUMA_NODE DAOS_IFACE_NAME DAOS_IFACE_DOMAIN_NAME ; do \ - if eval "[[ -z \"\$$it\" ]]" ; then \ - echo "[ERROR] Docker build argument $it is not defined" ; \ - exit 1 ; \ - fi ; \ - done ; \ - sed --in-place --regexp-extended \ - --expression '/(^@DAOS_IFACE_SECTION_BEGIN@$)|(^@DAOS_IFACE_SECTION_END@$)/d' \ - --expression "s/@DAOS_IFACE_NUMA_NODE@/${DAOS_IFACE_NUMA_NODE}/" \ - --expression "s/@DAOS_IFACE_NAME@/${DAOS_IFACE_NAME}/" \ - --expression "s/@DAOS_IFACE_DOMAIN_NAME@/${DAOS_IFACE_DOMAIN_NAME}/" \ - /tmp/daos_agent.yml.in ; \ - else \ - sed --in-place --regexp-extended \ - --expression '/^@DAOS_IFACE_SECTION_BEGIN@$/,/^@DAOS_IFACE_SECTION_END@/d' \ - /tmp/daos_agent.yml.in ; \ - fi && \ - mv -f /tmp/daos_agent.yml.in /etc/daos/daos_agent.yml - - -# Install DAOS agent launcher -COPY run-daos_agent.in /tmp/run-daos_agent.in -RUN if [[ "${DAOS_AUTH}" == yes ]] ; then \ - dnf --assumeyes install tar xz sudo && \ - sed --regexp-extended \ - --expression '/(^@DAOS_AUTH_SECTION_BEGIN@$)|(^@DAOS_AUTH_SECTION_END@$)/d' \ - /tmp/run-daos_agent.in > /usr/local/sbin/run-daos_agent ; \ - else \ - dnf --assumeyes install sudo && \ - sed --regexp-extended \ - --expression '/^@DAOS_AUTH_SECTION_BEGIN@$/,/^@DAOS_AUTH_SECTION_END@/d' \ - /tmp/run-daos_agent.in > /usr/local/sbin/run-daos_agent ; \ - fi && \ - chmod 755 /usr/local/sbin/run-daos_agent && \ - rm -f /tmp/run-daos_agent.in - -# Define 
entrypoint and cmd: -# - ENTRYPOINT for the command to run -# - CMD for the default arguments -ENTRYPOINT ["/usr/local/sbin/run-daos_agent"] -CMD ["start"] diff --git a/utils/docker/examples/client/daos-agent/el8/daos_agent.yml.in b/utils/docker/examples/client/daos-agent/el8/daos_agent.yml.in deleted file mode 100644 index 5af1708e4df2..000000000000 --- a/utils/docker/examples/client/daos-agent/el8/daos_agent.yml.in +++ /dev/null @@ -1,26 +0,0 @@ -name: daos_server -access_points: @DAOS_ACCESS_POINTS@ -port: @DAOS_PORT@ -runtime_dir: /var/run/daos_agent -# control_log_mask: debug -control_log_mask: info -log_file: /tmp/daos_agent.log - -transport_config: -@DAOS_NOAUTH_SECTION_BEGIN@ - allow_insecure: true -@DAOS_NOAUTH_SECTION_END@ -@DAOS_AUTH_SECTION_BEGIN@ - allow_insecure: false - ca_cert: /etc/daos/certs/daosCA.crt - cert: /etc/daos/certs/agent.crt - key: /etc/daos/certs/agent.key -@DAOS_AUTH_SECTION_END@ - -@DAOS_IFACE_SECTION_BEGIN@ -fabric_ifaces: -- numa_node: @DAOS_IFACE_NUMA_NODE@ - devices: - - iface: @DAOS_IFACE_NAME@ - domain: @DAOS_IFACE_DOMAIN_NAME@ -@DAOS_IFACE_SECTION_END@ diff --git a/utils/docker/examples/client/daos-client/el8/Dockerfile b/utils/docker/examples/client/daos-client/el8/Dockerfile deleted file mode 100644 index 3838d910e287..000000000000 --- a/utils/docker/examples/client/daos-client/el8/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2021-2023 Intel Corporation -# All rights reserved. -# -# 'recipe' for building a base RHEL DAOS client docker image -# -# This Dockerfile accept the following input build arguments: -# - DAOS_DOCKER_IMAGE_TAG Tag identifier of the DAOS client docker image (default "rocky8.6") - -# Pull base image -ARG DAOS_DOCKER_IMAGE_TAG=rocky8.6 -FROM daos-base:$DAOS_DOCKER_IMAGE_TAG -LABEL maintainer="daos@daos.groups.io" - -# Install DAOS agent configuration file -COPY daos_agent.yml /etc/daos/daos_agent.yml - -# Define entrypoint and cmd: -# - ENTRYPOINT for the command to run -# - CMD for the default arguments -ENTRYPOINT ["/usr/bin/bash"] -CMD ["-i"] diff --git a/utils/docker/examples/client/daos-client/el8/daos_agent.yml b/utils/docker/examples/client/daos-client/el8/daos_agent.yml deleted file mode 100644 index 179e017cdae5..000000000000 --- a/utils/docker/examples/client/daos-client/el8/daos_agent.yml +++ /dev/null @@ -1 +0,0 @@ -runtime_dir: /var/run/daos_agent diff --git a/utils/docker/examples/client/daos-client_agent/el8/daos-bash.in b/utils/docker/examples/client/daos-client_agent/el8/daos-bash.in deleted file mode 100644 index a3ec83fbce34..000000000000 --- a/utils/docker/examples/client/daos-client_agent/el8/daos-bash.in +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# set -x -set -e -o pipefail - -if [[ "$(id -u)" != "0" ]] ; then - echo "[ERROR] daos-bash can only be run as root" -fi - -CWD="$(realpath "$(dirname $0)")" - -/usr/bin/nohup /usr/bin/sudo --user=root --group=root /usr/local/sbin/run-daos_agent start < /dev/null &> /dev/null & - -exec /usr/bin/sudo --user=@DAOS_CLIENT_UNAME@ --group=@DAOS_CLIENT_GNAME@ /bin/bash "$@" diff --git a/utils/docker/examples/daos-admin/el8/.dockerignore b/utils/docker/examples/daos-admin/el8/.dockerignore new file mode 100644 index 000000000000..3bff8ec5c96b --- /dev/null +++ b/utils/docker/examples/daos-admin/el8/.dockerignore @@ -0,0 +1,4 @@ +# Ignore everything except Dockerfile configuration file +* +!daos-bash.sh +!daos_control.yml diff --git a/utils/docker/examples/daos-admin/el8/Dockerfile b/utils/docker/examples/daos-admin/el8/Dockerfile new file mode 100644 index 
000000000000..0d1b5b306c3b --- /dev/null +++ b/utils/docker/examples/daos-admin/el8/Dockerfile @@ -0,0 +1,42 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# 'recipe' for building a base RHEL DAOS admin image +# +# This Dockerfile accept the following input build arguments: +# - LINUX_DISTRO Linux distribution identifier (default "el8") +# - DAOS_DOCKER_IMAGE_NSP Namespace identifier of the base DAOS docker image (default "daos") +# - DAOS_DOCKER_IMAGE_TAG Tag identifier of the DAOS client docker image (default "v2.4.0") +# - DAOS_VERSION Version of DAOS to use (default "2.4.0-2.el8") + +# Pull base image +ARG LINUX_DISTRO="el8" +ARG DAOS_DOCKER_IMAGE_NSP="daos" +ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0" +FROM "$DAOS_DOCKER_IMAGE_NSP/daos-base-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG" +LABEL maintainer="daos@daos.groups.io" + +# Install DAOS package +ARG DAOS_VERSION="2.4.0-2.el8" +RUN echo "[INFO] Installing DAOS containerization dependencies" ; \ + dnf install \ + sudo \ + xz && \ + echo "[INFO] Installing DAOS" ; \ + dnf install \ + daos-admin-${DAOS_VERSION} && \ + dnf clean all + +# Install DAOS Bash launcher +COPY daos-bash.sh /usr/local/sbin/daos-bash +RUN echo "[INSTALL] Installing DAOS Agent entry point" ; \ + chmod 755 /usr/local/sbin/daos-bash + +# Configuration of DMG +COPY daos_control.yml /etc/daos/daos_control.yml + +# Define entrypoint and cmd: +# - ENTRYPOINT for the command to run +# - CMD for the default arguments +ENTRYPOINT ["/usr/local/sbin/daos-bash"] +CMD ["-i"] diff --git a/utils/docker/examples/daos-admin/el8/daos-bash.sh b/utils/docker/examples/daos-admin/el8/daos-bash.sh new file mode 100644 index 000000000000..bfd6ff41b3bf --- /dev/null +++ b/utils/docker/examples/daos-admin/el8/daos-bash.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# set -x +set -e -o pipefail + +if [[ "$(id -u)" != "0" ]] ; then + echo "[ERROR] daos-bash can only be run as root" +fi + +mkdir -p /etc/daos/certs +chmod 755 /etc/daos/certs +tar --extract --xz --directory=/etc/daos/certs --no-same-owner --preserve-permissions --file=/run/secrets/daos_admin-certs.txz +chmod 0644 /etc/daos/certs/daosCA.crt +chmod 0644 /etc/daos/certs/admin.crt +chmod 0400 /etc/daos/certs/admin.key +chown root:root /etc/daos/certs/daosCA.crt +chown root:root /etc/daos/certs/admin.crt +chown root:root /etc/daos/certs/admin.key + +exec sudo --user=root --group=root /bin/bash "$@" diff --git a/utils/docker/examples/daos-admin/el8/daos_control.yml.example b/utils/docker/examples/daos-admin/el8/daos_control.yml.example new file mode 100644 index 000000000000..30d3806cdbdd --- /dev/null +++ b/utils/docker/examples/daos-admin/el8/daos_control.yml.example @@ -0,0 +1,16 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# This is a simple example of the DAOS manager (dmg) configuration file. 
For the detailed +# information about this configuration file, refer to the official example available at +# https://github.com/daos-stack/daos/blob/master/utils/config/daos_control.yml + +name: daos_server +hostlist: ['localhost'] +port: 10001 + +transport_config: + allow_insecure: false + ca_cert: /etc/daos/certs/daosCA.crt + cert: /etc/daos/certs/admin.crt + key: /etc/daos/certs/admin.key diff --git a/utils/docker/examples/client/daos-agent/el8/.dockerignore b/utils/docker/examples/daos-agent/el8/.dockerignore similarity index 58% rename from utils/docker/examples/client/daos-agent/el8/.dockerignore rename to utils/docker/examples/daos-agent/el8/.dockerignore index 54d6ff5f71c0..6d04d4ed87ed 100644 --- a/utils/docker/examples/client/daos-agent/el8/.dockerignore +++ b/utils/docker/examples/daos-agent/el8/.dockerignore @@ -1,4 +1,4 @@ # Ignore everything except DAOS configuration file * -!daos_agent.yml.in -!run-daos_agent.in +!daos_agent.yml +!run-daos_agent.sh diff --git a/utils/docker/examples/daos-agent/el8/Dockerfile b/utils/docker/examples/daos-agent/el8/Dockerfile new file mode 100644 index 000000000000..dd0eec77ee6b --- /dev/null +++ b/utils/docker/examples/daos-agent/el8/Dockerfile @@ -0,0 +1,41 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# 'recipe' for building a base RHEL DAOS client docker image +# +# This Dockerfile accept the following input build arguments: +# - LINUX_DISTRO Linux distribution identifier (default "el8") +# - DAOS_DOCKER_IMAGE_NSP Namespace identifier of the base DAOS docker image (default "daos") +# - DAOS_DOCKER_IMAGE_TAG Tag identifier of the DAOS client docker image (default "v2.4.0") +# - DAOS_VERSION Version of DAOS to use (default "2.4.0-2.el8") + +# Pull base image +ARG LINUX_DISTRO="el8" +ARG DAOS_DOCKER_IMAGE_NSP="daos" +ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0" +FROM "$DAOS_DOCKER_IMAGE_NSP/daos-base-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG" +LABEL maintainer="daos@daos.groups.io" + +# Install DAOS package +ARG DAOS_VERSION="2.4.0-2.el8" +RUN echo "[INFO] Installing DAOS containerization dependencies" ; \ + dnf install \ + sudo \ + xz && \ + echo "[INFO] Installing DAOS" ; \ + dnf install \ + daos-client-${DAOS_VERSION} && \ + dnf clean all + +# Install DAOS agent configuration file +COPY daos_agent.yml /etc/daos/daos_agent.yml + +# Install DAOS agent launcher +COPY run-daos_agent.sh /usr/local/sbin/run-daos_agent +RUN chmod 755 /usr/local/sbin/run-daos_agent + +# Define entrypoint and cmd: +# - ENTRYPOINT for the command to run +# - CMD for the default arguments +ENTRYPOINT ["/usr/local/sbin/run-daos_agent"] +CMD ["start"] diff --git a/utils/docker/examples/daos-agent/el8/daos_agent.yml.example b/utils/docker/examples/daos-agent/el8/daos_agent.yml.example new file mode 100644 index 000000000000..13e79e3a0315 --- /dev/null +++ b/utils/docker/examples/daos-agent/el8/daos_agent.yml.example @@ -0,0 +1,25 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# This is a simple example of the DAOS agent configuration file. 
For the detailed information about +# this configuration file, refer to the official example available at +# https://github.com/daos-stack/daos/blob/master/utils/config/daos_agent.yml + +name: daos_server +access_points: ['localhost'] +port: 10001 +runtime_dir: /var/run/daos_agent +control_log_mask: debug +log_file: /tmp/daos_agent.log + +transport_config: + allow_insecure: false + ca_cert: /etc/daos/certs/daosCA.crt + cert: /etc/daos/certs/agent.crt + key: /etc/daos/certs/agent.key + +fabric_ifaces: + - numa_node: 0 + devices: + - iface: eth0 + domain: eth0 diff --git a/utils/docker/examples/client/daos-agent/el8/run-daos_agent.in b/utils/docker/examples/daos-agent/el8/run-daos_agent.sh similarity index 85% rename from utils/docker/examples/client/daos-agent/el8/run-daos_agent.in rename to utils/docker/examples/daos-agent/el8/run-daos_agent.sh index 3876441b790b..048ee8024609 100644 --- a/utils/docker/examples/client/daos-agent/el8/run-daos_agent.in +++ b/utils/docker/examples/daos-agent/el8/run-daos_agent.sh @@ -7,22 +7,18 @@ if [[ "$(id -u)" != "0" ]] ; then echo "[ERROR] run-daos_agent can only be run as root" fi -CWD="$(realpath "$(dirname $0)")" - mkdir -p /var/run/daos_agent/ chmod 755 /var/run/daos_agent/ chown daos_agent:daos_agent /var/run/daos_agent/ -@DAOS_AUTH_SECTION_BEGIN@ mkdir -p /etc/daos/certs chmod 755 /etc/daos/certs tar --extract --xz --directory=/etc/daos/certs --no-same-owner --preserve-permissions --file=/run/secrets/daos_agent-certs.txz chmod 0644 /etc/daos/certs/daosCA.crt chmod 0644 /etc/daos/certs/agent.crt -chmod 0600 /etc/daos/certs/agent.key +chmod 0400 /etc/daos/certs/agent.key chown root:root /etc/daos/certs/daosCA.crt chown daos_agent:daos_agent /etc/daos/certs/agent.crt chown daos_agent:daos_agent /etc/daos/certs/agent.key -@DAOS_AUTH_SECTION_END@ exec sudo --user=daos_agent --group=daos_agent /usr/bin/daos_agent "$@" diff --git a/utils/docker/examples/client/daos-base/el8/.dockerignore b/utils/docker/examples/daos-base/el8/.dockerignore similarity index 100% rename from utils/docker/examples/client/daos-base/el8/.dockerignore rename to utils/docker/examples/daos-base/el8/.dockerignore diff --git a/utils/docker/examples/client/daos-base/el8/Dockerfile b/utils/docker/examples/daos-base/el8/Dockerfile similarity index 61% rename from utils/docker/examples/client/daos-base/el8/Dockerfile rename to utils/docker/examples/daos-base/el8/Dockerfile index 1cd0439a431c..a9f4b0542064 100644 --- a/utils/docker/examples/client/daos-base/el8/Dockerfile +++ b/utils/docker/examples/daos-base/el8/Dockerfile @@ -4,26 +4,26 @@ # 'recipe' for building a base RHEL DAOS client docker image # # This Dockerfile accept the following input build arguments: -# - RHEL_BASE_IMAGE_NAME Base docker image name to use (default "rockylinux/rockylinux") -# - RHEL_BASE_IMAGE_TAG Tag identifier of the base docker image to use (default "8.6") -# - BUST_CACHE Manage docker building cache (default undefined). To invalidate the -# cache, a random value such as the date of day shall be given. 
-# - DAOS_REPOS Space separated list of repos needed to install DAOS (default -# "https://packages.daos.io/v2.2/EL8/packages/x86_64/") -# - DAOS_GPG_KEYS Space separated list of GPG keys associated with DAOS repos (default -# "https://packages.daos.io/RPM-GPG-KEY") -# - DAOS_REPOS_NOAUTH Space separated list of repos to use without GPG authentication -# (optional) -# - DAOS_VERSION Version of DAOS to use (default "2.2.0-4.el8") -# - DAOS_CLIENT_UNAME User name of the client (mandatory) -# - DAOS_CLIENT_UID User id of the client (mandatory) -# - DAOS_CLIENT_GNAME Group name of the client (mandatory) -# - DAOS_CLIENT_GID Group id of the client (mandatory) +# - DAOS_CLIENT_UNAME User name of the client (mandatory) +# - DAOS_CLIENT_UID User id of the client (mandatory) +# - DAOS_CLIENT_GNAME Group name of the client (mandatory) +# - DAOS_CLIENT_GID Group id of the client (mandatory) +# - LINUX_IMAGE_NAME Base docker image name to use (default "rockylinux/rockylinux") +# - LINUX_IMAGE_TAG Tag identifier of the base docker image to use (default "8.8") +# - BUST_CACHE Manage docker building cache (default undefined). To invalidate the +# cache, a random value such as the date of day shall be given. +# - DAOS_REPOS Space separated list of repos needed to install DAOS (default +# "https://packages.daos.io/v2.4/EL8/packages/x86_64/") +# - DAOS_GPG_KEYS Space separated list of GPG keys associated with DAOS repos (default +# "https://packages.daos.io/v2.4.0/RPM-GPG-KEY-2023") +# - DAOS_REPOS_NOAUTH Space separated list of repos to use without GPG authentication +# (optional) +# - DAOS_VERSION Version of DAOS to use (default "2.4.0-2.el8") # Pull base image -ARG RHEL_BASE_IMAGE_NAME=rockylinux/rockylinux -ARG RHEL_BASE_IMAGE_TAG=8.6 -FROM $RHEL_BASE_IMAGE_NAME:$RHEL_BASE_IMAGE_TAG +ARG LINUX_IMAGE_NAME="rockylinux/rockylinux" +ARG LINUX_IMAGE_TAG="8.8" +FROM "$LINUX_IMAGE_NAME:$LINUX_IMAGE_TAG" LABEL maintainer="daos@daos.groups.io" # Base configuration of dnf and system update @@ -37,15 +37,15 @@ RUN dnf clean all && dnf update && \ dnf clean all -# Install DAOS package -# XXX NOTE XXX Variable allowing to build the image without using global --no-cache option and thus -# XXX NOTE XXX to not update all rpms. To work properly a random value such as the date of the day -# XXX NOTE XXX should be given. +# Install base DAOS package +# XXX NOTE XXX The variable §BUST_CACHE allows to build the image without using global --no-cache +# XXX NOTE XXX option and thus to not update all rpms. To work properly a random value such as the +# XXX NOTE XXX date of the day should be given. 
ARG BUST_CACHE="" -ARG DAOS_REPOS="https://packages.daos.io/v2.2/EL8/packages/x86_64/" -ARG DAOS_GPG_KEYS="https://packages.daos.io/RPM-GPG-KEY" +ARG DAOS_REPOS="https://packages.daos.io/v2.4/EL8/packages/x86_64/" +ARG DAOS_GPG_KEYS="https://packages.daos.io/v2.4.0/RPM-GPG-KEY-2023" ARG DAOS_REPOS_NOAUTH="" -ARG DAOS_VERSION="2.2.0-4.el8" +ARG DAOS_VERSION="2.4.0-2.el8" RUN if [ -n "$BUST_CACHE" ] ; then \ echo "[INFO] Busting cache" ; \ dnf update ; \ @@ -64,9 +64,7 @@ RUN if [ -n "$BUST_CACHE" ] ; then done && \ echo "[INFO] Installing DAOS" ; \ dnf install \ - daos-${DAOS_VERSION} \ - daos-client-${DAOS_VERSION} \ - daos-client-tests-${DAOS_VERSION} && \ + daos-${DAOS_VERSION} && \ dnf clean all # Install DAOS client user and group information @@ -79,10 +77,15 @@ RUN for it in DAOS_CLIENT_UNAME DAOS_CLIENT_UID DAOS_CLIENT_GNAME DAOS_CLIENT_GI echo "[ERROR] Docker build argument $it is not defined" ; \ exit 1 ; \ fi ; \ - done ; \ - echo "[INFO] Adding DAOS client user ${DAOS_CLIENT_UNAME}" ; \ - groupadd -g ${DAOS_CLIENT_GID} ${DAOS_CLIENT_GNAME} && \ - useradd -g ${DAOS_CLIENT_GID} -u ${DAOS_CLIENT_UID} ${DAOS_CLIENT_UNAME} + done && \ + if ! getent group ${DAOS_CLIENT_GNAME} > /dev/null 2>&1; then \ + echo "[INFO] Adding DAOS client group ${DAOS_CLIENT_GNAME}" ; \ + groupadd -g ${DAOS_CLIENT_GID} ${DAOS_CLIENT_GNAME} ; \ + fi && \ + if ! getent passwd ${DAOS_CLIENT_UNAME} > /dev/null 2>&1; then \ + echo "[INFO] Adding DAOS client user ${DAOS_CLIENT_UNAME}" ; \ + useradd -g ${DAOS_CLIENT_GID} -u ${DAOS_CLIENT_UID} ${DAOS_CLIENT_UNAME} ; \ + fi # Define entrypoint and cmd: # - ENTRYPOINT for the command to run diff --git a/utils/docker/examples/client/daos-client/el8/.dockerignore b/utils/docker/examples/daos-client/el8/.dockerignore similarity index 100% rename from utils/docker/examples/client/daos-client/el8/.dockerignore rename to utils/docker/examples/daos-client/el8/.dockerignore diff --git a/utils/docker/examples/daos-client/el8/Dockerfile b/utils/docker/examples/daos-client/el8/Dockerfile new file mode 100644 index 000000000000..cf0ceeb54c6f --- /dev/null +++ b/utils/docker/examples/daos-client/el8/Dockerfile @@ -0,0 +1,34 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. 
+#
+# 'recipe' for building a base RHEL DAOS client docker image
+#
+# This Dockerfile accepts the following input build arguments:
+# - LINUX_DISTRO             Linux distribution identifier (default "el8")
+# - DAOS_DOCKER_IMAGE_NSP    Namespace identifier of the base DAOS docker image (default "daos")
+# - DAOS_DOCKER_IMAGE_TAG    Tag identifier of the DAOS client docker image (default "v2.4.0")
+# - DAOS_VERSION             Version of DAOS to use (default "2.4.0-2.el8")
+
+# Pull base image
+ARG LINUX_DISTRO="el8"
+ARG DAOS_DOCKER_IMAGE_NSP="daos"
+ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0"
+FROM "$DAOS_DOCKER_IMAGE_NSP/daos-base-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG"
+LABEL maintainer="daos@daos.groups.io"
+
+# Install DAOS client packages
+ARG DAOS_VERSION="2.4.0-2.el8"
+RUN echo "[INFO] Installing DAOS" ; \
+    dnf install \
+        daos-client-${DAOS_VERSION} \
+        daos-client-tests-${DAOS_VERSION} && \
+    dnf clean all
+
+# Install minimal DAOS agent configuration file
+COPY daos_agent.yml /etc/daos/daos_agent.yml
+
+# Define entrypoint and cmd:
+# - ENTRYPOINT for the command to run
+# - CMD for the default arguments
+ENTRYPOINT ["/usr/bin/bash"]
+CMD ["-i"]
diff --git a/utils/docker/examples/daos-client/el8/daos_agent.yml b/utils/docker/examples/daos-client/el8/daos_agent.yml
new file mode 100644
index 000000000000..973c80c8295a
--- /dev/null
+++ b/utils/docker/examples/daos-client/el8/daos_agent.yml
@@ -0,0 +1,6 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# Minimal configuration file for using the daos cli.
+
+runtime_dir: /var/run/daos_agent
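The daos-base and daos-client images above are meant to be built in a chain. As an illustration only (these commands are not part of this change; the tag and namespace mirror the defaults documented above, and the compose files later in this change drive the same builds from a `.env` file), a manual build might look like this:

```bash
# Sketch of a manual build using the documented build arguments.
# BUST_CACHE is set to the current date to force the rpm update layer to rebuild.
docker build \
    --build-arg DAOS_CLIENT_UNAME="$(id -un)" \
    --build-arg DAOS_CLIENT_UID="$(id -u)" \
    --build-arg DAOS_CLIENT_GNAME="$(id -gn)" \
    --build-arg DAOS_CLIENT_GID="$(id -g)" \
    --build-arg BUST_CACHE="$(date)" \
    --tag daos/daos-base-el8:v2.4.0 \
    utils/docker/examples/daos-base/el8

# The client image is then built FROM the daos-base image produced above.
docker build \
    --tag daos/daos-client-el8:v2.4.0 \
    utils/docker/examples/daos-client/el8
```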
diff --git a/utils/docker/examples/client/daos-client_agent/el8/.dockerignore b/utils/docker/examples/daos-client_agent/el8/.dockerignore
index 5255adedbaf8..0487a262da46 100644
--- a/utils/docker/examples/client/daos-client_agent/el8/.dockerignore
+++ b/utils/docker/examples/daos-client_agent/el8/.dockerignore
@@ -1,3 +1,3 @@
 # Ignore everything except DAOS configuration file
 *
-!daos-bash.in
+!daos-bash.sh
diff --git a/utils/docker/examples/client/daos-client_agent/el8/Dockerfile b/utils/docker/examples/daos-client_agent/el8/Dockerfile
similarity index 52%
rename from utils/docker/examples/client/daos-client_agent/el8/Dockerfile
rename to utils/docker/examples/daos-client_agent/el8/Dockerfile
index 646e4c28c9b2..aa4b567e079f 100644
--- a/utils/docker/examples/client/daos-client_agent/el8/Dockerfile
+++ b/utils/docker/examples/daos-client_agent/el8/Dockerfile
@@ -4,25 +4,42 @@
 # 'recipe' for building a base RHEL DAOS client docker image
 #
 # This Dockerfile accepts the following input build arguments:
-# - DAOS_DOCKER_IMAGE_TAG    Tag identifier of the DAOS client docker image (default "rocky8.6")
+# - LINUX_DISTRO             Linux distribution identifier (default "el8")
+# - DAOS_DOCKER_IMAGE_NSP    Namespace identifier of the base DAOS docker image (default "daos")
+# - DAOS_DOCKER_IMAGE_TAG    Tag identifier of the DAOS client docker image (default "v2.4.0")
+# - DAOS_VERSION             Version of DAOS to use (default "2.4.0-2.el8")
 # - DAOS_CLIENT_UNAME        User name of the client (mandatory)
 # - DAOS_CLIENT_GNAME        Group name of the client (mandatory)

 # Pull base image
-ARG DAOS_DOCKER_IMAGE_TAG=rocky8.6
-FROM daos-agent:$DAOS_DOCKER_IMAGE_TAG
+ARG LINUX_DISTRO="el8"
+ARG DAOS_DOCKER_IMAGE_NSP="daos"
+ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0"
+FROM "$DAOS_DOCKER_IMAGE_NSP/daos-agent-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG"
 LABEL maintainer="daos@daos.groups.io"

+# Install DAOS client test packages
+ARG DAOS_VERSION="2.4.0-2.el8"
+RUN echo "[INFO] Installing DAOS containerization dependencies" ; \
+    dnf install \
+        sudo \
+        xz && \
+    echo "[INFO] Installing DAOS" ; \
+    dnf install \
+        daos-client-tests-${DAOS_VERSION} && \
+    dnf clean all
+
 # Install DAOS Bash launcher
 ARG DAOS_CLIENT_UNAME=""
 ARG DAOS_CLIENT_GNAME=""
-COPY daos-bash.in /tmp/daos-bash.in
-RUN for it in DAOS_CLIENT_UNAME DAOS_CLIENT_GNAME ; do \
+COPY daos-bash.sh /tmp/daos-bash.in
+RUN echo "[INFO] Installing DAOS Agent entry point" ; \
+    for it in DAOS_CLIENT_UNAME DAOS_CLIENT_GNAME ; do \
         if eval "[[ -z \$$it ]]" ; then \
             echo "[ERROR] Docker build argument $it is not defined" ; \
             exit 1 ; \
         fi ; \
-    done ; \
+    done && \
     sed --regexp-extended \
         --expression "s/@DAOS_CLIENT_UNAME@/${DAOS_CLIENT_UNAME}/" \
         --expression "s/@DAOS_CLIENT_GNAME@/${DAOS_CLIENT_GNAME}/" \
diff --git a/utils/docker/examples/daos-client_agent/el8/daos-bash.sh b/utils/docker/examples/daos-client_agent/el8/daos-bash.sh
new file mode 100644
index 000000000000..d2999fe9afe9
--- /dev/null
+++ b/utils/docker/examples/daos-client_agent/el8/daos-bash.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# set -x
+set -e -o pipefail
+
+if [[ "$(id -u)" != "0" ]] ; then
+    echo "[ERROR] daos-bash can only be run as root"
+    exit 1
+fi
+
+nohup sudo --user=root --group=root /usr/local/sbin/run-daos_agent start < /dev/null &> /dev/null &
+
+exec sudo --user=@DAOS_CLIENT_UNAME@ --group=@DAOS_CLIENT_GNAME@ /bin/bash "$@"
diff --git a/utils/docker/examples/daos-server/el8/.dockerignore b/utils/docker/examples/daos-server/el8/.dockerignore
new file mode 100644
index 000000000000..457cf6f08247
--- /dev/null
+++ b/utils/docker/examples/daos-server/el8/.dockerignore
@@ -0,0 +1,5 @@
+# Ignore everything except Dockerfile configuration file
+*
+!daos_server.yml
+!run-daos_server.sh
+!50_daos_limits.conf
diff --git a/utils/docker/examples/daos-server/el8/50_daos_limits.conf b/utils/docker/examples/daos-server/el8/50_daos_limits.conf
new file mode 100644
index 000000000000..e9e9cee8d897
--- /dev/null
+++ b/utils/docker/examples/daos-server/el8/50_daos_limits.conf
@@ -0,0 +1,3 @@
+* soft memlock unlimited
+* hard memlock unlimited
+* nofile 1048576
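The `50_daos_limits.conf` file above raises the memlock and open-file limits that SPDK and the DAOS engines require. As a quick, illustrative check (the container name `daos_server` is an assumption), the effective limits can be inspected once the server container is running with the matching `ulimits` from the compose file shown later:

```bash
# Illustrative only: verify memlock and nofile limits inside the container.
docker exec daos_server bash -c 'ulimit -l && ulimit -n'
# Expected output when the limits are applied:
#   unlimited
#   1048576
```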
diff --git a/utils/docker/examples/daos-server/el8/Dockerfile b/utils/docker/examples/daos-server/el8/Dockerfile
new file mode 100644
index 000000000000..d5a003f8ca3e
--- /dev/null
+++ b/utils/docker/examples/daos-server/el8/Dockerfile
@@ -0,0 +1,45 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# 'recipe' for building a base RHEL DAOS server docker image
+#
+# This Dockerfile accepts the following input build arguments:
+# - LINUX_DISTRO             Linux distribution identifier (default "el8")
+# - DAOS_DOCKER_IMAGE_NSP    Namespace identifier of the base DAOS docker image (default "daos")
+# - DAOS_DOCKER_IMAGE_TAG    Tag identifier of the DAOS base docker image (default "v2.4.0")
+# - DAOS_VERSION             Version of DAOS to use (default "2.4.0-2.el8")
+
+# Pull base image
+ARG LINUX_DISTRO="el8"
+ARG DAOS_DOCKER_IMAGE_NSP="daos"
+ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0"
+FROM "$DAOS_DOCKER_IMAGE_NSP/daos-base-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG"
+LABEL maintainer="daos@daos.groups.io"
+
+# Install DAOS package
+ARG DAOS_VERSION="2.4.0-2.el8"
+RUN echo "[INFO] Installing DAOS containerization dependencies" ; \
+    dnf install \
+        kmod \
+        sudo \
+        xz && \
+    echo "[INFO] Installing DAOS" ; \
+    dnf install \
+        daos-server-${DAOS_VERSION} && \
+    dnf clean all
+
+# Configuration of the server
+COPY daos_server.yml /etc/daos/daos_server.yml
+
+# Install DAOS server launcher
+COPY run-daos_server.sh /usr/local/sbin/run-daos_server
+RUN chmod 755 /usr/local/sbin/run-daos_server
+
+# Install security limits configuration file
+COPY 50_daos_limits.conf /etc/security/limits.d/50_daos_limits.conf
+
+# Define entrypoint and cmd:
+# - ENTRYPOINT for the command to run
+# - CMD for the default arguments
+ENTRYPOINT ["/usr/local/sbin/run-daos_server"]
+CMD ["start"]
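Given the entrypoint and default command above, the server image can also be exercised outside of Docker Compose. The following one-off invocation is a sketch only: it mirrors the privileged mode, host networking, and bind mounts that `docker-compose.server.yml` (later in this change) grants the service, and it assumes a `daos_server-certs.txz` tarball in the current directory, since the launcher script extracts it from `/run/secrets/`:

```bash
# Sketch: run the server image directly; not part of this change.
docker run --rm -it --privileged --network=host \
    --ulimit memlock=-1:-1 --ulimit nofile=1048576 \
    -v /sys/devices/system/node:/sys/devices/system/node \
    -v /lib/modules:/lib/modules \
    -v /dev/hugepages:/dev/hugepages \
    -v "$PWD/daos_server-certs.txz:/run/secrets/daos_server-certs.txz:ro" \
    daos/daos-server-el8:v2.4.0 start
```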
diff --git a/utils/docker/examples/daos-server/el8/daos_server.yml.example b/utils/docker/examples/daos-server/el8/daos_server.yml.example
new file mode 100644
index 000000000000..64d3baf7e79a
--- /dev/null
+++ b/utils/docker/examples/daos-server/el8/daos_server.yml.example
@@ -0,0 +1,70 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# This is a simple example of the daos_server service configuration file.  For the detailed
+# information about this configuration file, refer to the official example available at
+# https://github.com/daos-stack/daos/blob/master/utils/config/daos_server.yml
+
+name: daos_server
+access_points: ['localhost']
+port: 10001
+
+provider: ofi+tcp;ofi_rxm
+socket_dir: /var/run/daos_server
+disable_vfio: true
+
+transport_config:
+  allow_insecure: false
+  client_cert_dir: /etc/daos/certs/clients
+  ca_cert: /etc/daos/certs/daosCA.crt
+  cert: /etc/daos/certs/server.crt
+  key: /etc/daos/certs/server.key
+
+helper_log_file: /tmp/daos_server_helper.log
+control_log_file: /tmp/daos_server.log
+control_log_mask: INFO
+control_metadata:
+  path: /var/db/daos_server
+
+engines:
+  - pinned_numa_node: 0
+    targets: 16
+    nr_xs_helpers: 4
+    fabric_iface: "eth0"
+    fabric_iface_port: 31416
+    log_file: /tmp/daos_engine-0.log
+    log_mask: INFO
+    storage:
+      - class: ram
+        scm_mount: /mnt/daos0
+        scm_size: 64
+      - class: nvme
+        bdev_list: ['0000:41:00.0', '0000:42:00.0', '0000:43:00.0', '0000:44:00.0']
+        bdev_roles:
+          - data
+      - class: nvme
+        bdev_list: ['0000:45:00.0']
+        bdev_roles:
+          - meta
+          - wal
+
+  - pinned_numa_node: 1
+    targets: 16
+    nr_xs_helpers: 4
+    fabric_iface: "eth1"
+    fabric_iface_port: 32416
+    log_file: /tmp/daos_engine-1.log
+    log_mask: INFO
+    storage:
+      - class: ram
+        scm_mount: /mnt/daos1
+        scm_size: 64
+      - class: nvme
+        bdev_list: ['0000:81:00.0', '0000:82:00.0', '0000:83:00.0', '0000:84:00.0']
+        bdev_roles:
+          - data
+      - class: nvme
+        bdev_list: ['0000:85:00.0']
+        bdev_roles:
+          - meta
+          - wal
diff --git a/utils/docker/examples/daos-server/el8/run-daos_server.sh b/utils/docker/examples/daos-server/el8/run-daos_server.sh
new file mode 100644
index 000000000000..c9e4f1f15a83
--- /dev/null
+++ b/utils/docker/examples/daos-server/el8/run-daos_server.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# set -x
+set -e -o pipefail
+
+if [[ "$(id -u)" != "0" ]] ; then
+    echo "[ERROR] run-daos_server can only be run as root"
+    exit 1
+fi
+
+mkdir -p /var/run/daos_server/
+chmod 755 /var/run/daos_server/
+chown root:root /var/run/daos_server/
+
+mkdir -p /etc/daos/certs/clients
+chmod 755 /etc/daos/certs
+chmod 700 /etc/daos/certs/clients
+tar --extract --xz --directory=/etc/daos/certs --no-same-owner --preserve-permissions --file=/run/secrets/daos_server-certs.txz
+mv /etc/daos/certs/agent.crt /etc/daos/certs/clients/agent.crt
+chmod 644 /etc/daos/certs/daosCA.crt
+chmod 644 /etc/daos/certs/server.crt
+chmod 400 /etc/daos/certs/server.key
+chmod 644 /etc/daos/certs/clients/agent.crt
+chown root:root /etc/daos/certs/daosCA.crt
+chown root:root /etc/daos/certs/server.crt
+chown root:root /etc/daos/certs/server.key
+chown root:root /etc/daos/certs/clients/agent.crt
+
+cd /var/run/daos_server/
+exec sudo --user=root --group=root /usr/bin/daos_server "$@"
diff --git a/utils/docker/examples/daos-spdk_setup/el8/.dockerignore b/utils/docker/examples/daos-spdk_setup/el8/.dockerignore
new file mode 100644
index 000000000000..3e7f93662839
--- /dev/null
+++ b/utils/docker/examples/daos-spdk_setup/el8/.dockerignore
@@ -0,0 +1,2 @@
+# Ignore everything except Dockerfile configuration file
+*
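Both launcher scripts, `run-daos_agent` in the agent image and `run-daos_server` above, expect their certificates as an xz-compressed tarball exposed through a Docker secret. Certificate generation is site specific; as a sketch that assumes the `gen_certificates.sh` helper from a DAOS source tree (an assumption, adapt to your own CA process), the server secret could be packed like this:

```bash
# Sketch: generate a CA and per-service certificates (the helper path and its
# daosCA/certs output directory are assumptions), then pack the files that
# run-daos_server extracts from /run/secrets/daos_server-certs.txz.
./utils/certs/gen_certificates.sh .
tar --create --xz --file=daos_server-certs.txz \
    --directory=daosCA/certs daosCA.crt server.crt server.key agent.crt
```

The agent secret (`daos_agent-certs.txz`) is assembled the same way from `daosCA.crt`, `agent.crt` and `agent.key`.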
diff --git a/utils/docker/examples/daos-spdk_setup/el8/Dockerfile b/utils/docker/examples/daos-spdk_setup/el8/Dockerfile
new file mode 100644
index 000000000000..47169d8da74a
--- /dev/null
+++ b/utils/docker/examples/daos-spdk_setup/el8/Dockerfile
@@ -0,0 +1,33 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# 'recipe' for building a base RHEL SPDK setup script wrapper
+#
+# This Dockerfile accepts the following input build arguments:
+# - LINUX_DISTRO             Linux distribution identifier (default "el8")
+# - DAOS_DOCKER_IMAGE_NSP    Namespace identifier of the base DAOS docker image (default "daos")
+# - DAOS_DOCKER_IMAGE_TAG    Tag identifier of the DAOS base docker image (default "v2.4.0")
+# - DAOS_VERSION             Version of DAOS to use (default "2.4.0-2.el8")
+
+# Pull base image
+ARG LINUX_DISTRO="el8"
+ARG DAOS_DOCKER_IMAGE_NSP="daos"
+ARG DAOS_DOCKER_IMAGE_TAG="v2.4.0"
+FROM "$DAOS_DOCKER_IMAGE_NSP/daos-base-$LINUX_DISTRO:$DAOS_DOCKER_IMAGE_TAG"
+LABEL maintainer="daos@daos.groups.io"
+
+# Install DAOS package
+ARG DAOS_VERSION="2.4.0-2.el8"
+RUN echo "[INFO] Installing SPDK tools dependencies" ; \
+    dnf install \
+        kmod && \
+    echo "[INFO] Installing SPDK tools" ; \
+    dnf install \
+        spdk-tools && \
+    dnf clean all
+
+# Define entrypoint and cmd:
+# - ENTRYPOINT for the command to run
+# - CMD for the default arguments
+ENTRYPOINT ["/usr/share/spdk/scripts/setup.sh"]
+CMD ["status"]
diff --git a/utils/docker/examples/client/deploy-docker_stack.sh b/utils/docker/examples/deploy-docker_stack.sh
similarity index 59%
rename from utils/docker/examples/client/deploy-docker_stack.sh
rename to utils/docker/examples/deploy-docker_stack.sh
index b4ea8358b227..92f7eed6dc5e 100644
--- a/utils/docker/examples/client/deploy-docker_stack.sh
+++ b/utils/docker/examples/deploy-docker_stack.sh
@@ -3,11 +3,11 @@
 # set -x
 set -e -o pipefail

-CWD="$(realpath "$(dirname "$0")")"
+CWD="$(realpath "${0%/*}")"

 set -a
 # shellcheck disable=SC1091
 source "$CWD/.env"
 set +a

-docker stack up -c "$1" daos_stack
+docker stack deploy -c "$1" daos_stack
diff --git a/utils/docker/examples/docker-compose.admin.yml b/utils/docker/examples/docker-compose.admin.yml
new file mode 100644
index 000000000000..1eb25f3b39f6
--- /dev/null
+++ b/utils/docker/examples/docker-compose.admin.yml
@@ -0,0 +1,26 @@
+# Copyright 2021-2023 Intel Corporation
+# All rights reserved.
+#
+# Docker Compose file allowing to build and deploy a containerized DAOS system
+
+version: "3.8"
+
+services:
+
+  daos_admin:
+    image: "${DAOS_DOCKER_IMAGE_NSP}/daos-admin-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}"
+    build:
+      context: "daos-admin/${LINUX_DISTRO}"
+      args:
+        - "LINUX_DISTRO=${LINUX_DISTRO}"
+        - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}"
+        - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}"
+        - "DAOS_VERSION=${DAOS_VERSION}"
+    tty: true
+    secrets:
+      - source: daos_admin-certs
+        target: daos_admin-certs.txz
+
+secrets:
+  daos_admin-certs:
+    file: "${DAOS_ADMIN_CERTS_TXZ}"
diff --git a/utils/docker/examples/client/docker-compose.daos_base.yml b/utils/docker/examples/docker-compose.base.yml
similarity index 64%
rename from utils/docker/examples/client/docker-compose.daos_base.yml
rename to utils/docker/examples/docker-compose.base.yml
index 26ee6f186e47..cc55ae1fccd6 100644
--- a/utils/docker/examples/client/docker-compose.daos_base.yml
+++ b/utils/docker/examples/docker-compose.base.yml
@@ -1,24 +1,24 @@
 # Copyright 2021-2023 Intel Corporation
 # All rights reserved.
# -# Docker Compose file allowing to build and deploy locally a DAOS virtual cluster +# Docker Compose file allowing to build and deploy a containerized DAOS system version: "3.8" services: daos_base: - image: "daos-base:${DAOS_DOCKER_IMAGE_TAG}" + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-base-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" build: - context: "daos-base/el8" + context: "daos-base/${LINUX_DISTRO}" args: - "DAOS_CLIENT_UNAME=${DAOS_CLIENT_UNAME}" - "DAOS_CLIENT_UID=${DAOS_CLIENT_UID}" - "DAOS_CLIENT_GNAME=${DAOS_CLIENT_GNAME}" - "DAOS_CLIENT_GID=${DAOS_CLIENT_GID}" - - "RHEL_BASE_IMAGE_NAME=${RHEL_BASE_IMAGE_NAME}" - - "RHEL_BASE_IMAGE_TAG=${RHEL_BASE_IMAGE_TAG}" - "BUST_CACHE=${BUST_CACHE}" + - "LINUX_IMAGE_NAME=${LINUX_IMAGE_NAME}" + - "LINUX_IMAGE_TAG=${LINUX_IMAGE_TAG}" - "DAOS_REPOS=${DAOS_REPOS}" - "DAOS_GPG_KEYS=${DAOS_GPG_KEYS}" - "DAOS_REPOS_NOAUTH=${DAOS_REPOS_NOAUTH}" diff --git a/utils/docker/examples/client/docker-compose.daos_client.standalone.yml b/utils/docker/examples/docker-compose.client_bm.yml similarity index 55% rename from utils/docker/examples/client/docker-compose.daos_client.standalone.yml rename to utils/docker/examples/docker-compose.client_bm.yml index bc6d909ed7e0..bd9f9b3ccf3d 100644 --- a/utils/docker/examples/client/docker-compose.daos_client.standalone.yml +++ b/utils/docker/examples/docker-compose.client_bm.yml @@ -1,18 +1,21 @@ # Copyright 2021-2023 Intel Corporation # All rights reserved. # -# Docker Compose file allowing to build and deploy locally a DAOS virtual cluster +# Docker Compose file allowing to build and deploy a containerized DAOS system version: "3.8" services: - daos_client: - image: "daos-client:${DAOS_DOCKER_IMAGE_TAG}" + daos_client_bm: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-client_bm-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" build: - context: "daos-client/el8" + context: "daos-client/${LINUX_DISTRO}" args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" + - "DAOS_VERSION=${DAOS_VERSION}" tty: true network_mode: host pid: host diff --git a/utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml b/utils/docker/examples/docker-compose.client_gt.yml similarity index 58% rename from utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml rename to utils/docker/examples/docker-compose.client_gt.yml index 85b58bf1543b..b45e5388c702 100644 --- a/utils/docker/examples/client/docker-compose.daos_client_agent.gathered.yml +++ b/utils/docker/examples/docker-compose.client_gt.yml @@ -1,18 +1,21 @@ # Copyright 2021-2023 Intel Corporation # All rights reserved. 
# -# Docker Compose file allowing to build and deploy locally a DAOS virtual cluster +# Docker Compose file allowing to build and deploy a containerized DAOS system version: "3.8" services: - daos_client_agent: - image: "daos-client_agent:${DAOS_DOCKER_IMAGE_TAG}" + daos_client_gt: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-client_gt-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" build: - context: "daos-client_agent/el8" + context: "daos-client_agent/${LINUX_DISTRO}" args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" + - "DAOS_VERSION=${DAOS_VERSION}" - "DAOS_CLIENT_UNAME=${DAOS_CLIENT_UNAME}" - "DAOS_CLIENT_GNAME=${DAOS_CLIENT_GNAME}" tty: true @@ -25,7 +28,6 @@ services: - source: daos_agent-certs target: daos_agent-certs.txz - secrets: daos_agent-certs: file: "${DAOS_AGENT_CERTS_TXZ}" diff --git a/utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml b/utils/docker/examples/docker-compose.client_sa.yml similarity index 60% rename from utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml rename to utils/docker/examples/docker-compose.client_sa.yml index 79d1d6c6d214..2e0626996aed 100644 --- a/utils/docker/examples/client/docker-compose.daos_client_agent.standalone.yml +++ b/utils/docker/examples/docker-compose.client_sa.yml @@ -1,25 +1,21 @@ # Copyright 2021-2023 Intel Corporation # All rights reserved. # -# Docker Compose file allowing to build and deploy locally a DAOS virtual cluster +# Docker Compose file allowing to build and deploy a containerized DAOS system version: "3.8" services: daos_agent: - image: "daos-agent:${DAOS_DOCKER_IMAGE_TAG}" + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-agent-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" build: - context: "daos-agent/el8" + context: "daos-agent/${LINUX_DISTRO}" args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" - - "DAOS_ACCESS_POINTS=${DAOS_ACCESS_POINTS}" - - "DAOS_PORT=${DAOS_PORT}" - - "DAOS_IFACE_CFG=${DAOS_IFACE_CFG}" - - "DAOS_IFACE_NUMA_NODE=${DAOS_IFACE_NUMA_NODE}" - - "DAOS_IFACE_NAME=${DAOS_IFACE_NAME}" - - "DAOS_IFACE_DOMAIN_NAME=${DAOS_IFACE_DOMAIN_NAME}" - - "DAOS_AUTH=${DAOS_AUTH}" + - "DAOS_VERSION=${DAOS_VERSION}" network_mode: host pid: host volumes: @@ -32,12 +28,15 @@ services: - source: daos_agent-certs target: daos_agent-certs.txz - daos_client: - image: "daos-client:${DAOS_DOCKER_IMAGE_TAG}" + daos_client_sa: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-client_sa-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" build: - context: "daos-client/el8" + context: "daos-client/${LINUX_DISTRO}" args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" + - "DAOS_VERSION=${DAOS_VERSION}" tty: true network_mode: host pid: host @@ -55,11 +54,9 @@ services: depends_on: - daos_agent - volumes: daos_agent-socket: - secrets: daos_agent-certs: file: "${DAOS_AGENT_CERTS_TXZ}" diff --git a/utils/docker/examples/docker-compose.server.yml b/utils/docker/examples/docker-compose.server.yml new file mode 100644 index 000000000000..812e376c7ad7 --- /dev/null +++ b/utils/docker/examples/docker-compose.server.yml @@ -0,0 +1,47 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. 
+# +# Docker Compose file allowing to build and deploy a containerized DAOS system + +version: "3.8" + +services: + + daos_server: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-server-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" + build: + context: "daos-server/${LINUX_DISTRO}" + args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" + - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" + - "DAOS_VERSION=${DAOS_VERSION}" + # XXX Seems to not be possible to use uio without privileged mode + # https://github.com/moby/moby/issues/22825 + privileged: true + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: 1048576 + network_mode: host + volumes: + - type: bind + read_only: false + source: /sys/devices/system/node + target: /sys/devices/system/node + - type: bind + read_only: false + source: /lib/modules + target: /lib/modules + - type: bind + read_only: false + source: /dev/hugepages + target: /dev/hugepages + secrets: + - source: daos_server-certs + target: daos_server-certs.txz + +secrets: + daos_server-certs: + file: "${DAOS_SERVER_CERTS_TXZ}" diff --git a/utils/docker/examples/docker-compose.spdk_setup.yml b/utils/docker/examples/docker-compose.spdk_setup.yml new file mode 100644 index 000000000000..009450d2e08f --- /dev/null +++ b/utils/docker/examples/docker-compose.spdk_setup.yml @@ -0,0 +1,31 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# Docker Compose file allowing to build and deploy a containerized DAOS system + +version: "3.8" + +services: + + daos_spdk_setup: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-spdk_setup-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" + build: + context: "daos-spdk_setup/${LINUX_DISTRO}" + args: + - "LINUX_DISTRO=${LINUX_DISTRO}" + - "DAOS_DOCKER_IMAGE_NSP=${DAOS_DOCKER_IMAGE_NSP}" + - "DAOS_DOCKER_IMAGE_TAG=${DAOS_DOCKER_IMAGE_TAG}" + - "DAOS_VERSION=${DAOS_VERSION}" + # XXX Seems to not be possible to use uio without privileged mode + # https://github.com/moby/moby/issues/22825 + privileged: true + network_mode: none + volumes: + - type: bind + read_only: false + source: /lib/modules + target: /lib/modules + - type: bind + read_only: false + source: /dev/hugepages + target: /dev/hugepages diff --git a/utils/docker/examples/docker-stack.admin.yml b/utils/docker/examples/docker-stack.admin.yml new file mode 100644 index 000000000000..cdbb53c8244f --- /dev/null +++ b/utils/docker/examples/docker-stack.admin.yml @@ -0,0 +1,21 @@ +# Copyright 2021-2023 Intel Corporation +# All rights reserved. +# +# Docker Compose file allowing to deploy a DAOS admin service with Docker stack + +version: "3.8" + +services: + daos_admin: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-admin-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" + tty: true + secrets: + - source: daos_admin-certs + target: daos_admin-certs.txz + uid: '0' + gid: '0' + mode: 0600 + +secrets: + daos_admin-certs: + external: true diff --git a/utils/docker/examples/client/docker-stack.daos_client_agent.gathered.yml b/utils/docker/examples/docker-stack.client_gt.yml similarity index 66% rename from utils/docker/examples/client/docker-stack.daos_client_agent.gathered.yml rename to utils/docker/examples/docker-stack.client_gt.yml index f25329e605eb..ea05233dcded 100644 --- a/utils/docker/examples/client/docker-stack.daos_client_agent.gathered.yml +++ b/utils/docker/examples/docker-stack.client_gt.yml @@ -1,14 +1,13 @@ # Copyright 2021-2023 Intel Corporation # All rights reserved. 
# -# Docker Compose file allowing to build and deploy locally a DAOS virtual cluster +# Docker Compose file allowing to deploy a DAOS client service with Docker stack version: "3.8" services: - - daos_client_agent: - image: "daos-client_agent:${DAOS_DOCKER_IMAGE_TAG}" + daos_client_gt: + image: "${DAOS_DOCKER_IMAGE_NSP}/daos-client_gt-${LINUX_DISTRO}:${DAOS_DOCKER_IMAGE_TAG}" tty: true networks: - docker_host @@ -19,12 +18,10 @@ services: gid: '0' mode: 0600 - secrets: daos_agent-certs: external: true - networks: docker_host: name: "host" diff --git a/utils/node_local_test.py b/utils/node_local_test.py index bb45ff361e25..74a17e6b8447 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1446,7 +1446,8 @@ def stop(self, ignore_einval=False): print('Stopping fuse') - self.run_query() + if self.container: + self.run_query() ret = umount(self.dir) if ret: umount(self.dir, background=True) @@ -1535,12 +1536,17 @@ def run_query(self, use_json=False, quiet=False): print(rc) return rc - def check_usage(self, ino=None, inodes=None, open_files=None, pools=None, containers=None): + def check_usage(self, ino=None, inodes=None, open_files=None, pools=None, containers=None, + qpath=None): """Query and verify the dfuse statistics. Returns the raw numbers in a dict. """ - cmd = ['filesystem', 'query', self.dir] + cmd = ['filesystem', 'query'] + if qpath: + cmd.append(qpath) + else: + cmd.append(self.dir) if ino is not None: cmd.extend(['--inode', str(ino)]) @@ -1567,7 +1573,7 @@ def _evict_path(self, path): return rc.json['response'] - def evict_and_wait(self, paths): + def evict_and_wait(self, paths, qpath=None): """Evict a number of paths from dfuse""" inodes = [] for path in paths: @@ -1579,7 +1585,7 @@ def evict_and_wait(self, paths): for inode in inodes: found = True while found: - rc = self.check_usage(inode) + rc = self.check_usage(inode, qpath=qpath) print(rc) found = rc['resident'] if not found: @@ -1886,10 +1892,11 @@ class needs_dfuse_with_opt(): # pylint: disable=too-few-public-methods - def __init__(self, caching=None, wbcache=True, single_threaded=False): + def __init__(self, caching=None, wbcache=True, single_threaded=False, dfuse_inval=True): self.caching = caching self.wbcache = wbcache self.single_threaded = single_threaded + self.dfuse_inval = dfuse_inval def __call__(self, method): """Wrapper function""" @@ -1901,9 +1908,18 @@ def _helper(obj): if obj.call_index == 0: caching = True obj.needs_more = True - obj.test_name = f'{method.__name__}_with_caching' + obj.test_name = f'{method.__name__}_caching_on' else: caching = False + obj.test_name = f'{method.__name__}_caching_off' + + if not self.dfuse_inval: + assert self.caching is True + cont_attrs = {'dfuse-attr-time': '5m', + 'dfuse-dentry-time': '5m', + 'dfuse-dentry-dir-time': '5m', + 'dfuse-ndentry-time': '5m'} + obj.container.set_attrs(cont_attrs) obj.dfuse = DFuse(obj.server, obj.conf, @@ -2812,19 +2828,35 @@ def test_uns_create(self): rc = self.dfuse.run_query(use_json=True) assert rc.returncode == 0 - @needs_dfuse + @needs_dfuse_with_opt(dfuse_inval=False, caching=True) def test_uns_link(self): - """Simple test to create a container then create a path for it in dfuse""" + """Test to create a container then create a path for it in dfuse. + + Runs with dfuse already started, creates two new containers without links. + + Links one container into UNS and then destroys it through the link. 
+
+        Links the second container into UNS and then destroys it through the link, but checking
+        the inode counts before and after.
+
+        This test requires caching attributes to be set on the second container so that it does
+        not get evicted before the inode count check.
+        """
+        # Create a new container which is not linked
         container1 = create_cont(self.conf, self.pool, ctype="POSIX", label='mycont_uns_link1')
         cmd = ['cont', 'query', self.pool.id(), container1.id()]
         rc = run_daos_cmd(self.conf, cmd)
         assert rc.returncode == 0

+        # Create a second new container which is not linked
         container2 = create_cont(self.conf, self.pool, ctype="POSIX", label='mycont_uns_link2')
-        cmd = ['cont', 'query', self.pool.id(), container2.id()]
-        rc = run_daos_cmd(self.conf, cmd)
-        assert rc.returncode == 0
+        cont_attrs = {'dfuse-attr-time': '5m',
+                      'dfuse-dentry-time': '5m',
+                      'dfuse-dentry-dir-time': '5m',
+                      'dfuse-ndentry-time': '5m'}
+        container2.set_attrs(cont_attrs)

+        # Link and then destroy the first container
         path = join(self.dfuse.dir, 'uns_link1')
         cmd = ['cont', 'link', self.pool.id(), 'mycont_uns_link1', '--path', path]
         rc = run_daos_cmd(self.conf, cmd)
@@ -2835,7 +2867,10 @@ def test_uns_link(self):
         print(os.listdir(path))
         cmd = ['cont', 'destroy', '--path', path]
         rc = run_daos_cmd(self.conf, cmd)
+        assert rc.returncode == 0

+        # Link and then destroy the second container but check inode count before and after
+        # destroying.
         path = join(self.dfuse.dir, 'uns_link2')
         cmd = ['cont', 'link', self.pool.id(), container2.id(), '--path', path]
         rc = run_daos_cmd(self.conf, cmd)
@@ -3287,6 +3322,12 @@ def test_uns_basic(self):
         server = self.server
         conf = self.conf

+        cont_attrs = {'dfuse-attr-time': '5m',
+                      'dfuse-dentry-time': '5m',
+                      'dfuse-dentry-dir-time': '5m',
+                      'dfuse-ndentry-time': '5m'}
+        container.set_attrs(cont_attrs)
+
         # Start dfuse on the container.
         dfuse = DFuse(server, conf, container=container, caching=False)
         dfuse.start('uns-0')
@@ -3308,8 +3349,10 @@ def test_uns_basic(self):
         if dfuse.stop():
             self.fatal_errors = True

+        uns_container.set_attrs(cont_attrs)
+
         print('Trying UNS')
-        dfuse = DFuse(server, conf, caching=False)
+        dfuse = DFuse(server, conf, caching=True)
         dfuse.start('uns-1')

         # List the root container.
@@ -3323,6 +3366,9 @@ def test_uns_basic(self):
         print('Inserting entry point')
         uns_container_2 = create_cont(conf, pool=self.pool, path=uns_path)

+        uns_container_2.set_attrs(cont_attrs)
+        dfuse.evict_and_wait([uns_path], qpath=join(dfuse.dir, pool, container.uuid))
+
         # List the root container again.
         print(os.listdir(join(dfuse.dir, pool, container.uuid)))
@@ -3345,7 +3391,7 @@ def test_uns_basic(self):
         if dfuse.stop():
             self.fatal_errors = True
         print('Trying UNS with previous cont')
-        dfuse = DFuse(server, conf, caching=False)
+        dfuse = DFuse(server, conf, caching=True)
         dfuse.start('uns-3')

         second_path = join(dfuse.dir, pool, uns_container.uuid)
@@ -5407,16 +5453,40 @@ def _prep(self):

         fatal_errors = False

+        max_load_avg = 100
+
         # Now run all iterations in parallel up to max_child.  Iterations will be launched
         # in order but may not finish in order, rather they are processed in the order they
         # finish.  After each repetition completes then check for re-launch new processes
         # to keep the pipeline full.
         while not finished or active:
+            load_avg, _, _ = os.getloadavg()
+
+            # DAOS-14164 Back off on launching tests if the system is loaded.  If the node is
+            # above a certain load average then pause and lower the level of expected parallelism.
If the + # node is close to the maximum then do not decrease the count but put preference to + # completing running tests and only launch one test before re-sampling the load average. + + start_this_iteration = 10 + if max_child > 1 and load_avg > 0.8 * max_load_avg: + start_this_iteration = 1 + if load_avg > max_load_avg: + if max_count < max_child: + max_child -= 5 + else: + max_child -= 1 + max_child = max(max_child, 20) + print(f"High load average of {load_avg}, " + f"pausing and decreasing parallelism to {max_child} {max_count}") + if max_child > 20: + time.sleep(2) + if not finished: - while len(active) < max_child: + while start_this_iteration > 0 and len(active) < max_child: active.append(self._run_cmd(fid)) fid += 1 + start_this_iteration -= 1 if len(active) > max_count: max_count = len(active) @@ -5438,7 +5508,7 @@ def _prep(self): break print(f'Completed, fid {fid}') - print(f'Max in flight {max_count}') + print(f'Max in flight {max_count}/{max_child}') if to_rerun: print(f'Number of indexes to re-run {len(to_rerun)}') diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 6c0e1184d638..b6e2a20b6d22 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.5.100 -Release: 13%{?relval}%{?dist} +Release: 14%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -381,6 +381,7 @@ getent passwd daos_server >/dev/null || useradd -s /sbin/nologin -r -g daos_serv %preun server %systemd_preun %{server_svc_name} +# all of these macros are empty on EL so keep rpmlint happy %if (0%{?suse_version} > 0) %postun server %{?run_ldconfig} @@ -406,7 +407,6 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %files %defattr(-, root, root, -) %doc README.md -%{_sysconfdir}/ld.so.conf.d/daos.conf %dir %attr(0755,root,root) %{conf_dir}/certs %config(noreplace) %{conf_dir}/memcheck-cart.supp %dir %{conf_dir} @@ -431,6 +431,7 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %attr(2755,root,daos_server) %{_bindir}/daos_server %{_bindir}/daos_engine %{_bindir}/daos_metrics +%{_sysconfdir}/ld.so.conf.d/daos.conf %dir %{_libdir}/daos_srv %{_libdir}/daos_srv/libcont.so %{_libdir}/daos_srv/libdtx.so @@ -518,7 +519,6 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %config(noreplace) %{conf_dir}/fault-inject-cart.yaml %{_bindir}/fault_status %{_bindir}/crt_launch -# For avocado tests %{_bindir}/daos_perf %{_bindir}/daos_racer %{_bindir}/daos_test @@ -582,6 +582,13 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Fri Feb 02 2024 Ashley M. Pittman 2.3.103-15 +- Update pydaos install process +- Add a depency from daos-client-tests to daos-devel and gdb + +* Tue Jan 09 2024 Brian J. Murrell 2.5.100-14 +- Move /etc/ld.so.conf.d/daos.conf to daos-server sub-package + * Wed Dec 06 2023 Brian J. 
Murrell 2.5.100-13 - Update for EL 8.8 and Leap 15.5 - Update raft to 0.10.1-2.411.gefa15f4 diff --git a/utils/utest.yaml b/utils/utest.yaml index 88ffc5b5cd6f..31746405edf0 100644 --- a/utils/utest.yaml +++ b/utils/utest.yaml @@ -24,16 +24,20 @@ tests: - cmd: ["src/common/tests/btree.sh", "perf"] - cmd: ["src/common/tests/btree.sh", "perf", "direct"] + - cmd: ["src/common/tests/btree.sh", "perf", "direct", "emb"] - cmd: ["src/common/tests/btree.sh", "perf", "ukey"] - cmd: ["src/common/tests/btree.sh", "dyn", "perf"] - cmd: ["src/common/tests/btree.sh", "dyn", "perf", "ukey"] + - cmd: ["src/common/tests/btree.sh", "dyn", "perf", "emb"] - name: btree tests: - cmd: ["src/common/tests/btree.sh"] - cmd: ["src/common/tests/btree.sh", "direct"] + - cmd: ["src/common/tests/btree.sh", "direct", "emb"] - cmd: ["src/common/tests/btree.sh", "ukey"] - cmd: ["src/common/tests/btree.sh", "dyn", "ukey"] - cmd: ["src/common/tests/btree.sh", "dyn"] + - cmd: ["src/common/tests/btree.sh", "dyn", "emb"] - name: drpc base: "BUILD_DIR" tests:
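For reference, the new `emb` variants added to `utils/utest.yaml` above can also be run by hand from a checked-out and built source tree; this invocation is illustrative only:

```bash
# Run two of the newly added btree test variants directly.
src/common/tests/btree.sh direct emb
src/common/tests/btree.sh dyn emb
```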