From b2a53b1f39a6e2cd8c40b0a13515eaf1de7ae620 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 11 Jun 2024 16:36:48 +0100 Subject: [PATCH] DAOS-15966 control: Remove dmg storage query device-health (#14508) Remove unnecessary command alias in favor of dmg storage query list-devices --health. Signed-off-by: Tom Nabarro --- src/bio/README.md | 6 ++-- src/common/tests_dmg_helpers.c | 4 +-- src/control/cmd/dmg/json_test.go | 4 +-- src/control/cmd/dmg/pretty/storage_test.go | 2 +- src/control/cmd/dmg/storage_query.go | 25 +++---------- src/control/cmd/dmg/storage_query_test.go | 36 +++++++------------ src/control/lib/support/log.go | 2 +- src/control/server/ctl_smd_rpc_test.go | 10 +++--- src/include/daos/tests_lib.h | 4 +-- .../ftest/checksum/csum_error_logging.py | 16 +++++---- src/tests/ftest/nvme/health.py | 11 +++--- src/tests/ftest/util/dmg_utils.py | 15 -------- src/tests/ftest/util/dmg_utils_base.py | 14 ++------ 13 files changed, 50 insertions(+), 99 deletions(-) diff --git a/src/bio/README.md b/src/bio/README.md index 2439352eace..1dcdae11c97 100644 --- a/src/bio/README.md +++ b/src/bio/README.md @@ -75,7 +75,7 @@ The device owner xstream is responsible for maintaining anf updating all device The DAOS data plane will monitor NVMe SSDs every 60 seconds, including updating the health stats with current values, checking current device states, and making any necessary blobstore/device state transitions. Once a FAULTY state transition has occurred, the monitoring period will be reduced to 10 seconds to allow for quicker transitions and finer-grained monitoring until the device is fully evicted. Useful admin command to query device health: - - dmg storage query device-health [used to query SSD health stats] + - dmg storage query list-devices --health [used to query SSD health stats] While monitoring this health data, an admin can now make the determination to manually evict a faulty device. This data will also be used to set the faulty device criteria for automatic SSD eviction (available in a future release). @@ -179,10 +179,10 @@ Pools ``` -- Query Device Health Data: **$dmg storage query device-health** +- Query Device Health Data: **$dmg storage query list-devices --health** ``` -$ dmg storage query device-health --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 +$ dmg storage query list-devices --health --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 Devices: UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8d:00.0] Targets:[0] Rank:0 State:NORMAL diff --git a/src/common/tests_dmg_helpers.c b/src/common/tests_dmg_helpers.c index ef02790c6b6..7a7ee049b31 100644 --- a/src/common/tests_dmg_helpers.c +++ b/src/common/tests_dmg_helpers.c @@ -1439,8 +1439,8 @@ dmg_storage_query_device_health(const char *dmg_config_file, char *host, if (args == NULL) D_GOTO(out, rc = -DER_NOMEM); - rc = daos_dmg_json_pipe("storage query device-health ", dmg_config_file, - args, argcount, &dmg_out); + rc = daos_dmg_json_pipe("storage query list-devices --health ", dmg_config_file, args, + argcount, &dmg_out); if (rc != 0) { D_ERROR("dmg command failed\n"); goto out_json; diff --git a/src/control/cmd/dmg/json_test.go b/src/control/cmd/dmg/json_test.go index a332dec368b..893a312509a 100644 --- a/src/control/cmd/dmg/json_test.go +++ b/src/control/cmd/dmg/json_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -75,8 +75,6 @@ func TestDmg_JsonOutput(t *testing.T) { case "storage nvme-add-device": testArgs = append(testArgs, "-l", "foo.com", "-a", test.MockPCIAddr(), "-e", "0") - case "storage query device-health": - testArgs = append(testArgs, "-u", test.MockUUID()) case "storage set nvme-faulty": testArgs = append(testArgs, "--force", "-u", test.MockUUID()) case "storage replace nvme": diff --git a/src/control/cmd/dmg/pretty/storage_test.go b/src/control/cmd/dmg/pretty/storage_test.go index 1424cb92955..ed6be6ad41f 100644 --- a/src/control/cmd/dmg/pretty/storage_test.go +++ b/src/control/cmd/dmg/pretty/storage_test.go @@ -1585,7 +1585,7 @@ host1 No devices found `, }, - "device-health": { + "list-devices; with health": { noPools: true, hsm: mockHostStorageMap(t, &mockHostStorage{ diff --git a/src/control/cmd/dmg/storage_query.go b/src/control/cmd/dmg/storage_query.go index adbbc34d32a..2075350cebe 100644 --- a/src/control/cmd/dmg/storage_query.go +++ b/src/control/cmd/dmg/storage_query.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -70,26 +70,9 @@ func (cmd *smdQueryCmd) makeRequest(ctx context.Context, req *control.SmdQueryRe // storageQueryCmd is the struct representing the storage query subcommand type storageQueryCmd struct { - DeviceHealth devHealthQueryCmd `command:"device-health" description:"Query the device health"` - ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools with NVMe on the server"` - ListDevices listDevicesQueryCmd `command:"list-devices" description:"List storage devices on the server"` - Usage usageQueryCmd `command:"usage" description:"Show SCM & NVMe storage space utilization per storage server"` -} - -type devHealthQueryCmd struct { - smdQueryCmd - UUID string `short:"u" long:"uuid" description:"Device UUID. All devices queried if arg not set"` -} - -func (cmd *devHealthQueryCmd) Execute(_ []string) error { - ctx := cmd.MustLogCtx() - req := &control.SmdQueryReq{ - OmitPools: true, - IncludeBioHealth: true, - Rank: ranklist.NilRank, - UUID: cmd.UUID, - } - return cmd.makeRequest(ctx, req) + ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools with NVMe on the server"` + ListDevices listDevicesQueryCmd `command:"list-devices" description:"List storage devices on the server"` + Usage usageQueryCmd `command:"usage" description:"Show SCM & NVMe storage space utilization per storage server"` } type listDevicesQueryCmd struct { diff --git a/src/control/cmd/dmg/storage_query_test.go b/src/control/cmd/dmg/storage_query_test.go index 3cc082876a5..d07685490ff 100644 --- a/src/control/cmd/dmg/storage_query_test.go +++ b/src/control/cmd/dmg/storage_query_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -18,27 +18,6 @@ import ( func TestStorageQueryCommands(t *testing.T) { runCmdTests(t, []cmdTest{ - { - "per-server metadata device health query", - "storage query device-health --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", - printRequest(t, &control.SmdQueryReq{ - Rank: ranklist.NilRank, - OmitPools: true, - IncludeBioHealth: true, - UUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d", - }), - nil, - }, - { - "per-server metadata device health query (missing uuid)", - "storage query device-health", - printRequest(t, &control.SmdQueryReq{ - Rank: ranklist.NilRank, - OmitPools: true, - IncludeBioHealth: true, - }), - nil, - }, { "per-server metadata query pools", "storage query list-pools", @@ -77,7 +56,7 @@ func TestStorageQueryCommands(t *testing.T) { nil, }, { - "per-server metadata query devices (include health)", + "per-server metadata device query health", "storage query list-devices --health", printRequest(t, &control.SmdQueryReq{ Rank: ranklist.NilRank, @@ -86,6 +65,17 @@ func TestStorageQueryCommands(t *testing.T) { }), nil, }, + { + "per-server metadata device query health (by uuid)", + "storage query list-devices --health --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d", + printRequest(t, &control.SmdQueryReq{ + Rank: ranklist.NilRank, + OmitPools: true, + IncludeBioHealth: true, + UUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d", + }), + nil, + }, { "per-server metadata query devices (show only evicted)", "storage query list-devices --show-evicted", diff --git a/src/control/lib/support/log.go b/src/control/lib/support/log.go index bd6cb696cf1..95257f7bf03 100644 --- a/src/control/lib/support/log.go +++ b/src/control/lib/support/log.go @@ -86,7 +86,7 @@ const ( ) const DmgListDeviceCmd = "dmg storage query list-devices" -const DmgDeviceHealthCmd = "dmg storage query device-health" +const DmgDeviceHealthCmd = "dmg storage query list-devices --health" var DmgCmd = []string{ "dmg system get-prop", diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index a37c6548c68..6378e81d23c 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -485,7 +485,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, expErr: daos.Busy, }, - "device-health": { + "list-devices; with health": { req: &ctlpb.SmdQueryReq{ OmitPools: true, Rank: uint32(ranklist.NilRank), @@ -531,7 +531,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, }, }, - "device-health; no uuid in request": { + "list-devices; with health; no uuid in request": { req: &ctlpb.SmdQueryReq{ OmitPools: true, Rank: uint32(ranklist.NilRank), @@ -589,7 +589,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, }, }, - "device-health (NEW SMD); skip health collection": { + "list-devices; with health (NEW SMD); skip health collection": { req: &ctlpb.SmdQueryReq{ OmitPools: true, Rank: uint32(ranklist.NilRank), @@ -628,7 +628,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, }, }, - "device-health; DAOS Failure": { + "list-devices; with health; DAOS failure": { req: &ctlpb.SmdQueryReq{ OmitPools: true, Rank: uint32(ranklist.NilRank), diff --git a/src/include/daos/tests_lib.h b/src/include/daos/tests_lib.h index b63bf5ef72c..10731ea0d20 100644 --- a/src/include/daos/tests_lib.h +++ b/src/include/daos/tests_lib.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -514,7 +514,7 @@ int dmg_storage_set_nvme_fault(const char *dmg_config_file, * Get NVMe Device health stats. * * \param[in] dmg_config_file DMG config file - * \param[in] host Get device-health from the given host. + * \param[in] host Get list-devices --health from the given host. * \param[in] uuid UUID of the device. * \param[in,out] stats [in] Health stats for which to get counter value. * [out] Stats counter value. diff --git a/src/tests/ftest/checksum/csum_error_logging.py b/src/tests/ftest/checksum/csum_error_logging.py index a512836a5ef..567f81e9702 100644 --- a/src/tests/ftest/checksum/csum_error_logging.py +++ b/src/tests/ftest/checksum/csum_error_logging.py @@ -23,16 +23,17 @@ class CsumErrorLog(DaosCoreBase): @fail_on(CommandFailure) def get_checksum_error_value(self, dmg, device_id): - """Get checksum error value from dmg storage_query_device_health. + """Get checksum error value from dmg storage_query_list_devices with health. Args: - dmg (DmgCommand): the DmgCommand object used to call storage_query_device_health() + dmg (DmgCommand): the DmgCommand object used to call storage_query_list_devices() device_id (str): Device UUID. Returns: int: the number of checksum errors on the device """ - info = get_dmg_smd_info(dmg.storage_query_device_health, 'devices', uuid=device_id) + info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=device_id, + health=True) for devices in info.values(): for device in devices: try: @@ -40,7 +41,8 @@ def get_checksum_error_value(self, dmg, device_id): return device['ctrlr']['health_stats']['checksum_errs'] except KeyError as error: self.fail( - 'Error parsing dmg storage query device-health output: {}'.format(error)) + 'Error parsing dmg storage query list-devices --health output: {}'.format( + error)) return 0 @fail_on(CommandFailure) @@ -80,7 +82,8 @@ def test_csum_error_logging(self): if not device['uuid']: self.fail('Device uuid undefined') self.log_step( - 'Get checksum errors before running the test (dmg storage query device-health)') + 'Get checksum errors before running the test (dmg storage query list-devices ' + '--health)') check_sum = self.get_checksum_error_value(dmg, device['uuid']) dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients) dmg.copy_configuration(self.hostlist_clients) @@ -89,7 +92,8 @@ def test_csum_error_logging(self): self.run_subtest() test_run = True self.log_step( - 'Get checksum errors after running the test (dmg storage query device-health)') + 'Get checksum errors after running the test (dmg storage query list-devices ' + '--health)') check_sum_latest = self.get_checksum_error_value(dmg, device['uuid']) self.log.info('Checksum Errors after: %d', check_sum_latest) self.assertTrue(check_sum_latest > check_sum, 'Checksum Error Log not incremented') diff --git a/src/tests/ftest/nvme/health.py b/src/tests/ftest/nvme/health.py index c363b349c9f..d23cb8427a4 100644 --- a/src/tests/ftest/nvme/health.py +++ b/src/tests/ftest/nvme/health.py @@ -26,7 +26,7 @@ def test_monitor_for_large_pools(self): Test Description: Test Health monitor for large number of pools. Use Case: This test creates many pools and verifies the following command behavior: dmg storage query list-pools - dmg storage query device-health + dmg storage query list-devices --health dmg storage scan --nvme-health :avocado: tags=all,full_regression @@ -126,7 +126,8 @@ def test_monitor_for_large_pools(self): for uuid in sorted(uuid_dict.keys()): dmg.hostlist = host try: - info = get_dmg_smd_info(dmg.storage_query_device_health, 'devices', uuid=uuid) + info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=uuid, + health=True) except CommandFailure as error: self.fail(str(error)) self.log.info('Verifying the health of devices on %s', host) @@ -145,12 +146,12 @@ def test_monitor_for_large_pools(self): device['ctrlr']['dev_state'], device['uuid'], error_msg) except KeyError as error: self.fail( - "Error parsing dmg.storage_query_device_health() output: {}".format( + "Error parsing dmg.storage_query_list_devices() output: {}".format( error)) if errors: self.fail( - 'Detected {} error(s) verifying dmg storage query device-health output'.format( - errors)) + 'Detected {} error(s) verifying dmg storage query list-devices --health output'. + format(errors)) # Get the nvme-health try: diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index c509fbf1541..f1e4d98ca50 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -357,21 +357,6 @@ def storage_replace_nvme(self, old_uuid, new_uuid, no_reint=False): ("storage", "replace", "nvme"), old_uuid=old_uuid, new_uuid=new_uuid, no_reint=no_reint) - def storage_query_device_health(self, uuid): - """Get the result of the 'dmg storage query device-health' command. - - Args: - uuid (str): Device UUID to query. - - Raises: - CommandFailure: if the dmg storage query device-health command fails. - - Returns: - dict: the dmg json command output converted to a python dictionary - - """ - return self._get_json_result(("storage", "query", "device-health"), uuid=uuid) - def storage_scan_nvme_health(self): """Get the result of the 'dmg storage scan --nvme-health' command. diff --git a/src/tests/ftest/util/dmg_utils_base.py b/src/tests/ftest/util/dmg_utils_base.py index c3e7958d534..43272eac510 100644 --- a/src/tests/ftest/util/dmg_utils_base.py +++ b/src/tests/ftest/util/dmg_utils_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -741,9 +741,7 @@ def __init__(self): def get_sub_command_class(self): # pylint: disable=redefined-variable-type """Get the dmg storage query sub command object.""" - if self.sub_command.value == "device-health": - self.sub_command_class = self.DeviceHealthSubCommand() - elif self.sub_command.value == "list-devices": + if self.sub_command.value == "list-devices": self.sub_command_class = self.ListDevicesSubCommand() elif self.sub_command.value == "list-pools": self.sub_command_class = self.ListPoolsSubCommand() @@ -752,14 +750,6 @@ def get_sub_command_class(self): else: self.sub_command_class = None - class DeviceHealthSubCommand(CommandWithParameters): - """Defines a dmg storage query device-health object.""" - - def __init__(self): - """Create a dmg storage query device-health object.""" - super().__init__("/run/dmg/storage/query/device-health/*", "device-health") - self.uuid = FormattedParameter("-u {}", None) - class ListDevicesSubCommand(CommandWithParameters): """Defines a dmg storage query list-devices object."""