diff --git a/go.mod b/go.mod index 71b5744..c265656 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,17 @@ module github.com/NVIDIA/go-dcgm -go 1.16 +go 1.21 require ( github.com/Masterminds/semver v1.5.0 - github.com/bits-and-blooms/bitset v1.2.1 - github.com/gorilla/mux v1.8.0 + github.com/bits-and-blooms/bitset v1.13.0 + github.com/gorilla/mux v1.8.1 github.com/stretchr/testify v1.8.4 ) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/objx v0.5.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum index 51d3295..76f0f6c 100644 --- a/go.sum +++ b/go.sum @@ -2,15 +2,20 @@ github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3Q github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY= github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= +github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= diff --git a/pkg/dcgm/api.go b/pkg/dcgm/api.go index 07f1774..6dcdf6b 100644 --- a/pkg/dcgm/api.go +++ b/pkg/dcgm/api.go @@ -1,6 +1,7 @@ package dcgm import ( + "context" "fmt" "os" "sync" @@ -103,9 +104,10 @@ func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { return healthCheckByGpuId(gpuId) } -// Policy sets GPU usage and error policies and notifies in case of any violations via callback functions -func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { - return registerPolicy(gpuId, typ...) +// ListenForPolicyViolations sets GPU usage and error policies and notifies in case of any violations +func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error) { + groupId := GroupAllGPUs() + return registerPolicy(ctx, groupId, typ...) } // Introspect returns DCGM hostengine memory and CPU usage diff --git a/pkg/dcgm/bcast.go b/pkg/dcgm/bcast.go index 03ac70b..ced0a63 100644 --- a/pkg/dcgm/bcast.go +++ b/pkg/dcgm/bcast.go @@ -80,5 +80,8 @@ func (p *publisher) broadcast() { } func (p *publisher) closePublisher() { + for _, s := range p.subscriberList() { + p.remove(s) + } p.close <- true } diff --git a/pkg/dcgm/policy.go b/pkg/dcgm/policy.go index 3539180..ec14da1 100644 --- a/pkg/dcgm/policy.go +++ b/pkg/dcgm/policy.go @@ -9,10 +9,10 @@ extern int violationNotify(void* p); */ import "C" import ( + "context" "encoding/binary" "fmt" "log" - "math/rand" "sync" "time" "unsafe" @@ -262,7 +262,7 @@ func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList for _, key := range paramList { conditionParam, exists := paramMap[policyIndex(key)] if !exists { - return fmt.Errorf("Error: Invalid Policy condition, %v does not exist.\n", key) + return fmt.Errorf("Error: Invalid Policy condition, %v does not exist", key) } // set policy condition parameters // set condition type (bool or longlong) @@ -287,23 +287,11 @@ func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList return } -func registerPolicy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { +func registerPolicy(ctx context.Context, groupId GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error) { // init policy globals for internal API makePolicyChannels() makePolicyParmsMap() - name := fmt.Sprintf("policy%d", rand.Uint64()) - groupId, err := CreateGroup(name) - if err != nil { - return nil, err - } - - if err = AddToGroup(groupId, gpuId); err != nil { - return nil, err - } - - // make a list of all callback channels - var channels []chan PolicyViolation // make a list of policy conditions for setting their parameters var paramKeys []policyIndex // get all conditions to be set in setPolicy() @@ -313,54 +301,67 @@ func registerPolicy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, case DbePolicy: paramKeys = append(paramKeys, dbePolicyIndex) condition |= C.DCGM_POLICY_COND_DBE - channels = append(channels, callbacks["dbe"]) case PCIePolicy: paramKeys = append(paramKeys, pciePolicyIndex) condition |= C.DCGM_POLICY_COND_PCI - channels = append(channels, callbacks["pcie"]) case MaxRtPgPolicy: paramKeys = append(paramKeys, maxRtPgPolicyIndex) condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED - channels = append(channels, callbacks["maxrtpg"]) case ThermalPolicy: paramKeys = append(paramKeys, thermalPolicyIndex) condition |= C.DCGM_POLICY_COND_THERMAL - channels = append(channels, callbacks["thermal"]) case PowerPolicy: paramKeys = append(paramKeys, powerPolicyIndex) condition |= C.DCGM_POLICY_COND_POWER - channels = append(channels, callbacks["power"]) case NvlinkPolicy: paramKeys = append(paramKeys, nvlinkPolicyIndex) condition |= C.DCGM_POLICY_COND_NVLINK - channels = append(channels, callbacks["nvlink"]) case XidPolicy: paramKeys = append(paramKeys, xidPolicyIndex) condition |= C.DCGM_POLICY_COND_XID - channels = append(channels, callbacks["xid"]) } } + var err error if err = setPolicy(groupId, condition, paramKeys); err != nil { return nil, err } - result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), C.fpRecvUpdates(C.violationNotify)) + var finishCallback unsafe.Pointer + result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), C.fpRecvUpdates(finishCallback)) if err = errorString(result); err != nil { return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} } log.Println("Listening for violations...") - // merge - violation := make(chan PolicyViolation, len(channels)) + violation := make(chan PolicyViolation, len(typ)) go func() { - for _, c := range channels { - val := <-c - violation <- val + defer func() { + log.Println("unregister policy violation...") + close(violation) + unregisterPolicy(groupId, condition) + }() + for { + select { + case dbe := <-callbacks["dbe"]: + violation <- dbe + case pcie := <-callbacks["pcie"]: + violation <- pcie + case maxrtpg := <-callbacks["maxrtpg"]: + violation <- maxrtpg + case thermal := <-callbacks["thermal"]: + violation <- thermal + case power := <-callbacks["power"]: + violation <- power + case nvlink := <-callbacks["nvlink"]: + violation <- nvlink + case xid := <-callbacks["xid"]: + violation <- xid + case <-ctx.Done(): + return + } } - DestroyGroup(groupId) - close(violation) }() return violation, err @@ -370,7 +371,7 @@ func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { result := C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition) if err := errorString(result); err != nil { - fmt.Errorf("Error unregistering policy: %s", err) + log.Println(fmt.Errorf("error unregistering policy: %s", err)) } } diff --git a/pkg/dcgm/policy_test.go b/pkg/dcgm/policy_test.go index d905462..52416f3 100644 --- a/pkg/dcgm/policy_test.go +++ b/pkg/dcgm/policy_test.go @@ -1,3 +1,5 @@ +//go:build linux && cgo + /* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * @@ -17,6 +19,10 @@ package dcgm import ( + "context" + "log" + "math/rand" + "strings" "testing" "time" @@ -26,14 +32,17 @@ import ( func TestPolicyErrors(t *testing.T) { type testCase struct { - policy policyCondition - injectError func(gpu uint) error - assert func(cb PolicyViolation) + policy []policyCondition + numErrors int + injectError func() error + assert func(cb PolicyViolation, en int) } tests := []testCase{ { - policy: DbePolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{DbePolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_ECC_DBE_VOL_DEV", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_ECC_DBE_VOL_DEV, @@ -43,7 +52,7 @@ func TestPolicyErrors(t *testing.T) { int64(1), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, DbePolicy, cb.Condition) require.IsType(t, dbePolicyCondition{}, cb.Data) @@ -53,8 +62,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: PowerPolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{PowerPolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_POWER_USAGE", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_POWER_USAGE, @@ -64,7 +75,7 @@ func TestPolicyErrors(t *testing.T) { float64(300.0), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, PowerPolicy, cb.Condition) require.IsType(t, powerPolicyCondition{}, cb.Data) @@ -73,8 +84,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: PCIePolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{PCIePolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_POWER_USAGE", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_PCIE_REPLAY_COUNTER, @@ -84,7 +97,7 @@ func TestPolicyErrors(t *testing.T) { int64(1), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, PCIePolicy, cb.Condition) require.IsType(t, pciPolicyCondition{}, cb.Data) @@ -93,8 +106,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: MaxRtPgPolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{MaxRtPgPolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_RETIRED_DBE", gpu) err := InjectFieldValue(gpu, DCGM_FI_DEV_RETIRED_DBE, @@ -116,7 +131,7 @@ func TestPolicyErrors(t *testing.T) { } return err }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, MaxRtPgPolicy, cb.Condition) require.IsType(t, retiredPagesPolicyCondition{}, cb.Data) @@ -125,8 +140,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: ThermalPolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{ThermalPolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_GPU_TEMP", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_GPU_TEMP, @@ -136,7 +153,7 @@ func TestPolicyErrors(t *testing.T) { int64(101), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, ThermalPolicy, cb.Condition) require.IsType(t, thermalPolicyCondition{}, cb.Data) @@ -145,8 +162,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: NvlinkPolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{NvlinkPolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, @@ -156,7 +175,7 @@ func TestPolicyErrors(t *testing.T) { int64(1), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, NvlinkPolicy, cb.Condition) require.IsType(t, nvlinkPolicyCondition{}, cb.Data) @@ -165,8 +184,10 @@ func TestPolicyErrors(t *testing.T) { }, }, { - policy: XidPolicy, - injectError: func(gpu uint) error { + policy: []policyCondition{XidPolicy}, + numErrors: 1, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) return InjectFieldValue(gpu, DCGM_FI_DEV_XID_ERRORS, @@ -176,7 +197,7 @@ func TestPolicyErrors(t *testing.T) { int64(16), ) }, - assert: func(cb PolicyViolation) { + assert: func(cb PolicyViolation, _ int) { require.NotNil(t, cb) assert.Equal(t, XidPolicy, cb.Condition) require.IsType(t, xidPolicyCondition{}, cb.Data) @@ -184,13 +205,70 @@ func TestPolicyErrors(t *testing.T) { assert.Equal(t, uint(16), xidPolicyCondition.ErrNum) }, }, + { + //testcase: register multiple policy conditions + policy: []policyCondition{NvlinkPolicy, XidPolicy}, + numErrors: 2, + injectError: func() error { + gpu := uint(rand.Intn(8) + 1) + //Inject a DBE error; since it has not registered DBEPolicy it will not get this event. + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_ECC_DBE_VOL_DEV", gpu) + err := InjectFieldValue(gpu, + DCGM_FI_DEV_ECC_DBE_VOL_DEV, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(1), + ) + gpu = uint(rand.Intn(8) + 1) + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) + err = InjectFieldValue(gpu, + DCGM_FI_DEV_XID_ERRORS, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(16), + ) + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", gpu) + err = InjectFieldValue(gpu, + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(1), + ) + return err + }, + assert: func(cb PolicyViolation, en int) { + switch en { + case 1: + require.NotNil(t, cb) + assert.Equal(t, XidPolicy, cb.Condition) + require.IsType(t, xidPolicyCondition{}, cb.Data) + xidPolicyCondition := cb.Data.(xidPolicyCondition) + assert.Equal(t, uint(16), xidPolicyCondition.ErrNum) + case 2: + require.NotNil(t, cb) + assert.Equal(t, NvlinkPolicy, cb.Condition) + require.IsType(t, nvlinkPolicyCondition{}, cb.Data) + nvlinkPolicyCondition := cb.Data.(nvlinkPolicyCondition) + assert.Equal(t, uint(1), nvlinkPolicyCondition.Counter) + } + }, + }, } for _, tc := range tests { - t.Run(string(tc.policy), func(t *testing.T) { + t.Run(joinPolicy(tc.policy, "|"), func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) cleanup, err := Init(Embedded) require.NoError(t, err) - defer cleanup() + defer func() { + log.Printf("Cleaning up %s \n", t.Name()) + cleanup() + cancel() + time.Sleep(100 * time.Millisecond) + }() numGPUs, err := GetAllDeviceCount() require.NoError(t, err) @@ -200,30 +278,47 @@ func TestPolicyErrors(t *testing.T) { } entityList := []MigHierarchyInfo{ - { - Entity: GroupEntityPair{EntityGroupId: FE_GPU}, - }, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, + {Entity: GroupEntityPair{EntityGroupId: FE_GPU}}, } - gpuIDs, err := CreateFakeEntities(entityList) + _, err = CreateFakeEntities(entityList) require.NoError(t, err) - gpu := gpuIDs[0] - - callback, err := Policy(gpu, tc.policy) + callback, err := ListenForPolicyViolations(ctx, tc.policy...) require.NoError(t, err) - err = tc.injectError(gpu) + err = tc.injectError() require.NoError(t, err) - + numCb := 0 select { case callbackData := <-callback: require.NotNil(t, callbackData) - tc.assert(callbackData) - break + numCb++ + tc.assert(callbackData, numCb) + if numCb == tc.numErrors { + break + } case <-time.After(20 * time.Second): require.Fail(t, "policy callback never happened") } }) } } + +func joinPolicy(policy []policyCondition, sep string) string { + var result strings.Builder + for i, v := range policy { + if i > 0 { + result.WriteString(sep) + } + result.WriteString(string(v)) + } + return result.String() +} diff --git a/samples/policy/main.go b/samples/policy/main.go index ef2bc76..82c8c9f 100644 --- a/samples/policy/main.go +++ b/samples/policy/main.go @@ -1,26 +1,35 @@ package main import ( - "fmt" + "context" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "log" + "os" + "os/signal" + "syscall" ) // dcgmi group -c "name" --default // dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10 // dcgmi policy -g GROUPID --reg func main() { + ctx, done := context.WithCancel(context.Background()) + // Handle SIGINT (Ctrl+C) and SIGTERM (termination signal) + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + go func() { + <-sigs + log.Println("Received termination signal, exiting...") + done() + }() + cleanup, err := dcgm.Init(dcgm.Embedded) if err != nil { log.Panicln(err) } defer cleanup() - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - log.Panicln(err) - } - // Choose policy conditions to register violation callback. // Note: Need to be root for some options // Available options are: @@ -31,14 +40,18 @@ func main() { // 5. dcgm.PowerPolicy // 6. dcgm.NvlinkPolicy // 7. dcgm.XidPolicy - for _, gpu := range gpus { - c, err := dcgm.Policy(gpu, dcgm.XidPolicy) - if err != nil { - log.Panicln(err) - } + c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy) + if err != nil { + log.Panicln(err) + } - pe := <-c - fmt.Printf("GPU %8s %v\nError %6s %v\nTimestamp %2s %v\nData %7s %v\n", - ":", gpu, ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data) + for { + select { + case pe := <-c: + log.Printf("PolicyViolation %6s %v\nTimestamp %2s %v\nData %7s %v", + ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data) + case <-ctx.Done(): + return + } } } diff --git a/tests/dcgm_test.go b/tests/dcgm_test.go index 1b639e5..02b635e 100644 --- a/tests/dcgm_test.go +++ b/tests/dcgm_test.go @@ -98,6 +98,9 @@ func TestDeviceInfo(t *testing.T) { for _, val := range fields { var msg, output string res := Query(id, val) + if res == "[N/A]" { + continue + } switch val { case "driver_version": @@ -180,6 +183,9 @@ func TestDeviceStatus(t *testing.T) { for _, val := range fields { var msg, output string res := Query(id, val) + if res == "[N/A]" { + continue + } switch val { case "power.draw":