Skip to content

Commit

Permalink
DAOS-15966 control: Remove dmg storage query device-health (#14508)
Browse files Browse the repository at this point in the history
Remove unnecessary command alias in favor of dmg storage query
list-devices --health.

Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr authored Jun 11, 2024
1 parent 475e0b9 commit b2a53b1
Show file tree
Hide file tree
Showing 13 changed files with 50 additions and 99 deletions.
6 changes: 3 additions & 3 deletions src/bio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ The device owner xstream is responsible for maintaining anf updating all device
The DAOS data plane will monitor NVMe SSDs every 60 seconds, including updating the health stats with current values, checking current device states, and making any necessary blobstore/device state transitions. Once a FAULTY state transition has occurred, the monitoring period will be reduced to 10 seconds to allow for quicker transitions and finer-grained monitoring until the device is fully evicted.

Useful admin command to query device health:
- <a href="#81">dmg storage query device-health</a> [used to query SSD health stats]
- <a href="#81">dmg storage query list-devices --health</a> [used to query SSD health stats]

While monitoring this health data, an admin can now make the determination to manually evict a faulty device. This data will also be used to set the faulty device criteria for automatic SSD eviction (available in a future release).

Expand Down Expand Up @@ -179,10 +179,10 @@ Pools
```

<a id="81"></a>
- Query Device Health Data: **$dmg storage query device-health**
- Query Device Health Data: **$dmg storage query list-devices --health**

```
$ dmg storage query device-health --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
$ dmg storage query list-devices --health --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
Devices:
UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8d:00.0]
Targets:[0] Rank:0 State:NORMAL
Expand Down
4 changes: 2 additions & 2 deletions src/common/tests_dmg_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -1439,8 +1439,8 @@ dmg_storage_query_device_health(const char *dmg_config_file, char *host,
if (args == NULL)
D_GOTO(out, rc = -DER_NOMEM);

rc = daos_dmg_json_pipe("storage query device-health ", dmg_config_file,
args, argcount, &dmg_out);
rc = daos_dmg_json_pipe("storage query list-devices --health ", dmg_config_file, args,
argcount, &dmg_out);
if (rc != 0) {
D_ERROR("dmg command failed\n");
goto out_json;
Expand Down
4 changes: 1 addition & 3 deletions src/control/cmd/dmg/json_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -75,8 +75,6 @@ func TestDmg_JsonOutput(t *testing.T) {
case "storage nvme-add-device":
testArgs = append(testArgs, "-l", "foo.com", "-a",
test.MockPCIAddr(), "-e", "0")
case "storage query device-health":
testArgs = append(testArgs, "-u", test.MockUUID())
case "storage set nvme-faulty":
testArgs = append(testArgs, "--force", "-u", test.MockUUID())
case "storage replace nvme":
Expand Down
2 changes: 1 addition & 1 deletion src/control/cmd/dmg/pretty/storage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1585,7 +1585,7 @@ host1
No devices found
`,
},
"device-health": {
"list-devices; with health": {
noPools: true,
hsm: mockHostStorageMap(t,
&mockHostStorage{
Expand Down
25 changes: 4 additions & 21 deletions src/control/cmd/dmg/storage_query.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2023 Intel Corporation.
// (C) Copyright 2019-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -70,26 +70,9 @@ func (cmd *smdQueryCmd) makeRequest(ctx context.Context, req *control.SmdQueryRe

// storageQueryCmd is the struct representing the storage query subcommand
type storageQueryCmd struct {
DeviceHealth devHealthQueryCmd `command:"device-health" description:"Query the device health"`
ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools with NVMe on the server"`
ListDevices listDevicesQueryCmd `command:"list-devices" description:"List storage devices on the server"`
Usage usageQueryCmd `command:"usage" description:"Show SCM & NVMe storage space utilization per storage server"`
}

type devHealthQueryCmd struct {
smdQueryCmd
UUID string `short:"u" long:"uuid" description:"Device UUID. All devices queried if arg not set"`
}

func (cmd *devHealthQueryCmd) Execute(_ []string) error {
ctx := cmd.MustLogCtx()
req := &control.SmdQueryReq{
OmitPools: true,
IncludeBioHealth: true,
Rank: ranklist.NilRank,
UUID: cmd.UUID,
}
return cmd.makeRequest(ctx, req)
ListPools listPoolsQueryCmd `command:"list-pools" description:"List pools with NVMe on the server"`
ListDevices listDevicesQueryCmd `command:"list-devices" description:"List storage devices on the server"`
Usage usageQueryCmd `command:"usage" description:"Show SCM & NVMe storage space utilization per storage server"`
}

type listDevicesQueryCmd struct {
Expand Down
36 changes: 13 additions & 23 deletions src/control/cmd/dmg/storage_query_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2023 Intel Corporation.
// (C) Copyright 2019-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -18,27 +18,6 @@ import (

func TestStorageQueryCommands(t *testing.T) {
runCmdTests(t, []cmdTest{
{
"per-server metadata device health query",
"storage query device-health --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d",
printRequest(t, &control.SmdQueryReq{
Rank: ranklist.NilRank,
OmitPools: true,
IncludeBioHealth: true,
UUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d",
}),
nil,
},
{
"per-server metadata device health query (missing uuid)",
"storage query device-health",
printRequest(t, &control.SmdQueryReq{
Rank: ranklist.NilRank,
OmitPools: true,
IncludeBioHealth: true,
}),
nil,
},
{
"per-server metadata query pools",
"storage query list-pools",
Expand Down Expand Up @@ -77,7 +56,7 @@ func TestStorageQueryCommands(t *testing.T) {
nil,
},
{
"per-server metadata query devices (include health)",
"per-server metadata device query health",
"storage query list-devices --health",
printRequest(t, &control.SmdQueryReq{
Rank: ranklist.NilRank,
Expand All @@ -86,6 +65,17 @@ func TestStorageQueryCommands(t *testing.T) {
}),
nil,
},
{
"per-server metadata device query health (by uuid)",
"storage query list-devices --health --uuid 842c739b-86b5-462f-a7ba-b4a91b674f3d",
printRequest(t, &control.SmdQueryReq{
Rank: ranklist.NilRank,
OmitPools: true,
IncludeBioHealth: true,
UUID: "842c739b-86b5-462f-a7ba-b4a91b674f3d",
}),
nil,
},
{
"per-server metadata query devices (show only evicted)",
"storage query list-devices --show-evicted",
Expand Down
2 changes: 1 addition & 1 deletion src/control/lib/support/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ const (
)

const DmgListDeviceCmd = "dmg storage query list-devices"
const DmgDeviceHealthCmd = "dmg storage query device-health"
const DmgDeviceHealthCmd = "dmg storage query list-devices --health"

var DmgCmd = []string{
"dmg system get-prop",
Expand Down
10 changes: 5 additions & 5 deletions src/control/server/ctl_smd_rpc_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -485,7 +485,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
expErr: daos.Busy,
},
"device-health": {
"list-devices; with health": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Expand Down Expand Up @@ -531,7 +531,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
},
},
"device-health; no uuid in request": {
"list-devices; with health; no uuid in request": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Expand Down Expand Up @@ -589,7 +589,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
},
},
"device-health (NEW SMD); skip health collection": {
"list-devices; with health (NEW SMD); skip health collection": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Expand Down Expand Up @@ -628,7 +628,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
},
},
"device-health; DAOS Failure": {
"list-devices; with health; DAOS failure": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Expand Down
4 changes: 2 additions & 2 deletions src/include/daos/tests_lib.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2015-2023 Intel Corporation.
* (C) Copyright 2015-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -514,7 +514,7 @@ int dmg_storage_set_nvme_fault(const char *dmg_config_file,
* Get NVMe Device health stats.
*
* \param[in] dmg_config_file DMG config file
* \param[in] host Get device-health from the given host.
* \param[in] host Get list-devices --health from the given host.
* \param[in] uuid UUID of the device.
* \param[in,out] stats [in] Health stats for which to get counter value.
* [out] Stats counter value.
Expand Down
16 changes: 10 additions & 6 deletions src/tests/ftest/checksum/csum_error_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,26 @@ class CsumErrorLog(DaosCoreBase):

@fail_on(CommandFailure)
def get_checksum_error_value(self, dmg, device_id):
"""Get checksum error value from dmg storage_query_device_health.
"""Get checksum error value from dmg storage_query_list_devices with health.
Args:
dmg (DmgCommand): the DmgCommand object used to call storage_query_device_health()
dmg (DmgCommand): the DmgCommand object used to call storage_query_list_devices()
device_id (str): Device UUID.
Returns:
int: the number of checksum errors on the device
"""
info = get_dmg_smd_info(dmg.storage_query_device_health, 'devices', uuid=device_id)
info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=device_id,
health=True)
for devices in info.values():
for device in devices:
try:
if device['uuid'] == device_id:
return device['ctrlr']['health_stats']['checksum_errs']
except KeyError as error:
self.fail(
'Error parsing dmg storage query device-health output: {}'.format(error))
'Error parsing dmg storage query list-devices --health output: {}'.format(
error))
return 0

@fail_on(CommandFailure)
Expand Down Expand Up @@ -80,7 +82,8 @@ def test_csum_error_logging(self):
if not device['uuid']:
self.fail('Device uuid undefined')
self.log_step(
'Get checksum errors before running the test (dmg storage query device-health)')
'Get checksum errors before running the test (dmg storage query list-devices '
'--health)')
check_sum = self.get_checksum_error_value(dmg, device['uuid'])
dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients)
dmg.copy_configuration(self.hostlist_clients)
Expand All @@ -89,7 +92,8 @@ def test_csum_error_logging(self):
self.run_subtest()
test_run = True
self.log_step(
'Get checksum errors after running the test (dmg storage query device-health)')
'Get checksum errors after running the test (dmg storage query list-devices '
'--health)')
check_sum_latest = self.get_checksum_error_value(dmg, device['uuid'])
self.log.info('Checksum Errors after: %d', check_sum_latest)
self.assertTrue(check_sum_latest > check_sum, 'Checksum Error Log not incremented')
Expand Down
11 changes: 6 additions & 5 deletions src/tests/ftest/nvme/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_monitor_for_large_pools(self):
Test Description: Test Health monitor for large number of pools.
Use Case: This test creates many pools and verifies the following command behavior:
dmg storage query list-pools
dmg storage query device-health
dmg storage query list-devices --health
dmg storage scan --nvme-health
:avocado: tags=all,full_regression
Expand Down Expand Up @@ -126,7 +126,8 @@ def test_monitor_for_large_pools(self):
for uuid in sorted(uuid_dict.keys()):
dmg.hostlist = host
try:
info = get_dmg_smd_info(dmg.storage_query_device_health, 'devices', uuid=uuid)
info = get_dmg_smd_info(dmg.storage_query_list_devices, 'devices', uuid=uuid,
health=True)
except CommandFailure as error:
self.fail(str(error))
self.log.info('Verifying the health of devices on %s', host)
Expand All @@ -145,12 +146,12 @@ def test_monitor_for_large_pools(self):
device['ctrlr']['dev_state'], device['uuid'], error_msg)
except KeyError as error:
self.fail(
"Error parsing dmg.storage_query_device_health() output: {}".format(
"Error parsing dmg.storage_query_list_devices() output: {}".format(
error))
if errors:
self.fail(
'Detected {} error(s) verifying dmg storage query device-health output'.format(
errors))
'Detected {} error(s) verifying dmg storage query list-devices --health output'.
format(errors))

# Get the nvme-health
try:
Expand Down
15 changes: 0 additions & 15 deletions src/tests/ftest/util/dmg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,21 +357,6 @@ def storage_replace_nvme(self, old_uuid, new_uuid, no_reint=False):
("storage", "replace", "nvme"), old_uuid=old_uuid,
new_uuid=new_uuid, no_reint=no_reint)

def storage_query_device_health(self, uuid):
"""Get the result of the 'dmg storage query device-health' command.
Args:
uuid (str): Device UUID to query.
Raises:
CommandFailure: if the dmg storage query device-health command fails.
Returns:
dict: the dmg json command output converted to a python dictionary
"""
return self._get_json_result(("storage", "query", "device-health"), uuid=uuid)

def storage_scan_nvme_health(self):
"""Get the result of the 'dmg storage scan --nvme-health' command.
Expand Down
14 changes: 2 additions & 12 deletions src/tests/ftest/util/dmg_utils_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
(C) Copyright 2020-2023 Intel Corporation.
(C) Copyright 2020-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -741,9 +741,7 @@ def __init__(self):
def get_sub_command_class(self):
# pylint: disable=redefined-variable-type
"""Get the dmg storage query sub command object."""
if self.sub_command.value == "device-health":
self.sub_command_class = self.DeviceHealthSubCommand()
elif self.sub_command.value == "list-devices":
if self.sub_command.value == "list-devices":
self.sub_command_class = self.ListDevicesSubCommand()
elif self.sub_command.value == "list-pools":
self.sub_command_class = self.ListPoolsSubCommand()
Expand All @@ -752,14 +750,6 @@ def get_sub_command_class(self):
else:
self.sub_command_class = None

class DeviceHealthSubCommand(CommandWithParameters):
"""Defines a dmg storage query device-health object."""

def __init__(self):
"""Create a dmg storage query device-health object."""
super().__init__("/run/dmg/storage/query/device-health/*", "device-health")
self.uuid = FormattedParameter("-u {}", None)

class ListDevicesSubCommand(CommandWithParameters):
"""Defines a dmg storage query list-devices object."""

Expand Down

0 comments on commit b2a53b1

Please sign in to comment.