Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NO-ISSUE: Add NVIDIA GPU operator only if there are NVIDIA GPUs #7218

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ func main() {
manifestsGenerator := network.NewManifestsGenerator(manifestsApi, Options.ManifestsGeneratorConfig, db)
clusterApi := cluster.NewManager(Options.ClusterConfig, log.WithField("pkg", "cluster-state"), db,
notificationStream, eventsHandler, uploadClient, hostApi, metricsManager, manifestsGenerator, lead, operatorsManager,
ocmClient, objectHandler, dnsApi, authHandler, manifestsApi, Options.EnableSoftTimeouts)
ocmClient, objectHandler, dnsApi, authHandler, manifestsApi, Options.EnableSoftTimeouts, usageManager)
infraEnvApi := infraenv.NewManager(log.WithField("pkg", "host-state"), db, objectHandler)

clusterEventsUploader := thread.New(
Expand Down
81 changes: 52 additions & 29 deletions internal/bminventory/inventory_test.go

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions internal/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/openshift/assisted-service/internal/operators"
"github.com/openshift/assisted-service/internal/stream"
"github.com/openshift/assisted-service/internal/uploader"
"github.com/openshift/assisted-service/internal/usage"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/auth"
"github.com/openshift/assisted-service/pkg/commonutils"
Expand Down Expand Up @@ -176,7 +177,8 @@ type Manager struct {
func NewManager(cfg Config, log logrus.FieldLogger, db *gorm.DB, stream stream.Notifier, eventsHandler eventsapi.Handler,
uploadClient uploader.Client, hostAPI host.API, metricApi metrics.API, manifestsGeneratorAPI network.ManifestsGeneratorAPI,
leaderElector leader.Leader, operatorsApi operators.API, ocmClient *ocm.Client, objectHandler s3wrapper.API,
dnsApi dns.DNSApi, authHandler auth.Authenticator, manifestApi manifestsapi.ManifestsAPI, softTimeoutsEnabled bool) *Manager {
dnsApi dns.DNSApi, authHandler auth.Authenticator, manifestApi manifestsapi.ManifestsAPI, softTimeoutsEnabled bool,
usageApi usage.API) *Manager {
th := &transitionHandler{
log: log,
db: db,
Expand All @@ -198,7 +200,7 @@ func NewManager(cfg Config, log logrus.FieldLogger, db *gorm.DB, stream stream.N
sm: NewClusterStateMachine(th),
metricAPI: metricApi,
manifestsGeneratorAPI: manifestsGeneratorAPI,
rp: newRefreshPreprocessor(log, hostAPI, operatorsApi),
rp: newRefreshPreprocessor(log, hostAPI, operatorsApi, usageApi),
hostAPI: hostAPI,
leaderElector: leaderElector,
prevMonitorInvokedAt: time.Now(),
Expand Down
113 changes: 81 additions & 32 deletions internal/cluster/cluster_test.go

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion internal/cluster/progress_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ var _ = Describe("Progress bar test", func() {
mockOperatorApi = operators.NewMockAPI(ctrl)
mockDnsApi = dns.NewMockDNSApi(ctrl)
clusterApi = NewManager(getDefaultConfig(), common.GetTestLog().WithField("pkg", "cluster-monitor"), db, commontesting.GetDummyNotificationStream(ctrl),
mockEvents, nil, mockHostAPI, mockMetric, nil, nil, mockOperatorApi, nil, nil, mockDnsApi, nil, nil, false)
mockEvents, nil, mockHostAPI, mockMetric, nil, nil, mockOperatorApi, nil, nil, mockDnsApi, nil, nil, false, nil)

mockOperatorApi.EXPECT().ResolveDependencies(gomock.Any(), gomock.Any()).DoAndReturn(
func(_ *common.Cluster, previousOperators []*models.MonitoredOperator) ([]*models.MonitoredOperator, error) {
return previousOperators, nil
},
).AnyTimes()
})

AfterEach(func() {
Expand Down
116 changes: 115 additions & 1 deletion internal/cluster/refresh_status_preprocessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package cluster

import (
"context"
"encoding/json"
"fmt"
"reflect"
"sort"
"strings"

Expand All @@ -11,6 +13,8 @@ import (
"github.com/openshift/assisted-service/internal/host"
"github.com/openshift/assisted-service/internal/operators"
"github.com/openshift/assisted-service/internal/operators/api"
operatorcommon "github.com/openshift/assisted-service/internal/operators/common"
"github.com/openshift/assisted-service/internal/usage"
"github.com/openshift/assisted-service/models"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand All @@ -34,9 +38,10 @@ type refreshPreprocessor struct {
validations []validation
conditions []condition
operatorsAPI operators.API
usageAPI usage.API
}

func newRefreshPreprocessor(log logrus.FieldLogger, hostAPI host.API, operatorsAPI operators.API) *refreshPreprocessor {
func newRefreshPreprocessor(log logrus.FieldLogger, hostAPI host.API, operatorsAPI operators.API, usageAPI usage.API) *refreshPreprocessor {
v := clusterValidator{
log: log,
hostAPI: hostAPI,
Expand All @@ -47,6 +52,7 @@ func newRefreshPreprocessor(log logrus.FieldLogger, hostAPI host.API, operatorsA
validations: newValidations(&v),
conditions: newConditions(&v),
operatorsAPI: operatorsAPI,
usageAPI: usageAPI,
}
}

Expand Down Expand Up @@ -84,6 +90,17 @@ func (r *refreshPreprocessor) preprocess(ctx context.Context, c *clusterPreproce
Message: message,
})
}

// Before validating the operators we need to recalculate the dependencies because changes in the hosts may
// imply changes in the dependencies between operators. For example, if the OpenShift AI operator is enabled and
// a new host with an NVIDIA GPU has been discovered, then the NVIDIA GPU operator will need to be added as a
// dependency, and then we will need to validate that secure boot is disabled.
err = r.recalculateOperatorDependencies(c)
if err != nil {
err = errors.Wrapf(err, "failed to recalculate operator dependencies for cluster '%s'", c.clusterId)
return nil, nil, err
}

// Validate operators
results, err := r.operatorsAPI.ValidateCluster(ctx, c.cluster)
if err != nil {
Expand Down Expand Up @@ -124,6 +141,103 @@ func (r *refreshPreprocessor) preprocess(ctx context.Context, c *clusterPreproce
return stateMachineInput, validationsOutput, nil
}

// recalculateOperatorDependencies calculates the operator dependencies and updates the database and the passed cluster
// accordingly.
func (r *refreshPreprocessor) recalculateOperatorDependencies(c *clusterPreprocessContext) error {
// Calculate and save the operators that have been added, updated or deleted:
previousOperators := c.cluster.MonitoredOperators
currentOperators, err := r.operatorsAPI.ResolveDependencies(c.cluster, c.cluster.MonitoredOperators)
if err != nil {
return errors.Wrapf(
err,
"failed to resolve operator dependencies for cluster '%s'",
c.clusterId,
)
}
var addedOperators, updatedOperators, deletedOperators []*models.MonitoredOperator
for _, currentOperator := range currentOperators {
if currentOperator.ClusterID == "" {
currentOperator.ClusterID = c.clusterId
}
previousOperator := operatorcommon.GetOperator(previousOperators, currentOperator.Name)
if previousOperator != nil {
if !reflect.DeepEqual(currentOperator, previousOperator) {
updatedOperators = append(updatedOperators, currentOperator)
}
} else {
addedOperators = append(addedOperators, currentOperator)
}
}
for _, previousOperator := range previousOperators {
if !operatorcommon.HasOperator(currentOperators, previousOperator.Name) {
deletedOperators = append(deletedOperators, previousOperator)
}
}
for _, addedOperator := range addedOperators {
err = c.db.Save(addedOperator).Error
if err != nil {
return errors.Wrapf(
err,
"failed to add operator '%s' to cluster '%s'",
addedOperator.Name, *c.cluster.ID,
)
}
}
for _, updatedOperator := range updatedOperators {
err = c.db.Save(updatedOperator).Error
if err != nil {
return errors.Wrapf(
err,
"failed to update operator '%s' for cluster '%s'",
updatedOperator.Name, *c.cluster.ID,
)
}
}
for _, deletedOperator := range deletedOperators {
err = c.db.Delete(deletedOperator).Error
if err != nil {
return errors.Wrapf(
err,
"failed to delete operator '%s' from cluster '%s'",
deletedOperator.Name,
c.clusterId,
)
}
}
c.cluster.MonitoredOperators = currentOperators

// If any operator has been added or deleted then we need to update the corresponding feature usage:
if r.usageAPI != nil && (len(addedOperators) > 0 || len(deletedOperators) > 0) {
var usages usage.FeatureUsage
usages, err = usage.Unmarshal(c.cluster.FeatureUsage)
if err != nil {
return errors.Wrapf(
err,
"failed to read feature usage from cluster '%s'",
c.clusterId,
)
}
for _, addedOperator := range addedOperators {
r.usageAPI.Add(usages, strings.ToUpper(addedOperator.Name), nil)
}
for _, deletedOperator := range deletedOperators {
r.usageAPI.Remove(usages, strings.ToUpper(deletedOperator.Name))
}
data, err := json.Marshal(usages)
if err != nil {
return errors.Wrapf(
err,
"failed to write feature usage to cluster '%s'",
c.clusterId,
)
}
c.cluster.FeatureUsage = string(data)
r.usageAPI.Save(c.db, c.clusterId, usages)
}

return nil
}

// sortByValidationResultID sorts results by models.ClusterValidationID
func sortByValidationResultID(validationResults []ValidationResult) {
sort.SliceStable(validationResults, func(i, j int) bool {
Expand Down
79 changes: 78 additions & 1 deletion internal/cluster/refresh_status_preprocessor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cluster

import (
"context"
"strings"

"github.com/go-openapi/strfmt"
"github.com/golang/mock/gomock"
Expand All @@ -12,6 +13,7 @@ import (
"github.com/openshift/assisted-service/internal/host"
"github.com/openshift/assisted-service/internal/host/hostutil"
"github.com/openshift/assisted-service/internal/operators"
"github.com/openshift/assisted-service/internal/usage"
"github.com/openshift/assisted-service/models"
"github.com/sirupsen/logrus"
"github.com/thoas/go-funk"
Expand All @@ -25,6 +27,7 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
ctrl *gomock.Controller
mockHostApi *host.MockAPI
mockOperatorManager *operators.MockAPI
mockUsageApi *usage.MockAPI
db *gorm.DB
dbName string
cluster *common.Cluster
Expand All @@ -37,16 +40,18 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
ctrl = gomock.NewController(GinkgoT())
mockHostApi = host.NewMockAPI(ctrl)
mockOperatorManager = operators.NewMockAPI(ctrl)
mockOperatorManager.EXPECT().ValidateCluster(ctx, gomock.Any())
mockUsageApi = usage.NewMockAPI(ctrl)
db, dbName = common.PrepareTestDB()
preprocessor = newRefreshPreprocessor(
logrus.New(),
mockHostApi,
mockOperatorManager,
mockUsageApi,
)
})

AfterEach(func() {
ctrl.Finish()
common.DeleteTestDB(db, dbName)
})

Expand Down Expand Up @@ -87,6 +92,31 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
}
}

mockNoChangeInOperatorDependencies := func() {
mockOperatorManager.EXPECT().ResolveDependencies(gomock.Any(), gomock.Any()).DoAndReturn(
func(_ *common.Cluster, previousOperators []*models.MonitoredOperator) ([]*models.MonitoredOperator, error) {
return previousOperators, nil
},
).AnyTimes()
}

mockAddedOperatorDependencies := func(addedOperators ...*models.MonitoredOperator) {
mockOperatorManager.EXPECT().ResolveDependencies(gomock.Any(), gomock.Any()).DoAndReturn(
func(_ *common.Cluster, previousOperators []*models.MonitoredOperator) ([]*models.MonitoredOperator, error) {
currentOperators := append(previousOperators, addedOperators...)
return currentOperators, nil
},
).Times(1)
for _, addedOperator := range addedOperators {
mockUsageApi.EXPECT().Add(gomock.Any(), strings.ToUpper(addedOperator.Name), gomock.Any()).Times(1)
}
mockUsageApi.EXPECT().Save(gomock.Any(), gomock.Any(), gomock.Any()).Times(1)
}

mockOperatorValidationsSuccess := func() {
mockOperatorManager.EXPECT().ValidateCluster(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
}

Context("Skipping Validations", func() {

cantBeIgnored := common.NonIgnorableClusterValidations
Expand Down Expand Up @@ -122,6 +152,8 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
})

It("Should allow permitted ignorable validations to be ignored", func() {
mockNoChangeInOperatorDependencies()
mockOperatorValidationsSuccess()
validationContext.cluster.IgnoredClusterValidations = "[\"network-type-valid\", \"ingress-vips-valid\", \"ingress-vips-defined\"]"
conditions, _, _ := preprocessor.preprocess(ctx, validationContext)
Expect(conditions).ToNot(BeEmpty())
Expand All @@ -136,6 +168,8 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
})

It("Should allow all permitted ignorable validations to be ignored", func() {
mockNoChangeInOperatorDependencies()
mockOperatorValidationsSuccess()
validationContext.cluster.IgnoredClusterValidations = "[\"all\"]"
conditions, _, _ := preprocessor.preprocess(ctx, validationContext)
Expect(conditions).ToNot(BeEmpty())
Expand All @@ -148,6 +182,8 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
})

It("Should never allow a specific mandatory validation to be ignored", func() {
mockNoChangeInOperatorDependencies()
mockOperatorValidationsSuccess()
validationContext.cluster.IgnoredClusterValidations = "[\"all\"]"
conditions, _, _ := preprocessor.preprocess(ctx, validationContext)
for _, unskippableHostValidation := range cantBeIgnored {
Expand All @@ -157,4 +193,45 @@ var _ = Describe("Cluster Refresh Status Preprocessor", func() {
}
})
})

Context("Recalculate operator dependencies", func() {
var validationContext *clusterPreprocessContext

BeforeEach(func() {
createCluster()
validationContext = newClusterValidationContext(cluster, db)
})

AfterEach(func() {
deleteCluster()
})

It("Adds new dependency", func() {
// Prepare the operators API so that it will add a new operator dependency:
mockAddedOperatorDependencies(
&models.MonitoredOperator{
Name: "myoperator",
},
)
mockOperatorValidationsSuccess()
_, _, err := preprocessor.preprocess(ctx, validationContext)
Expect(err).ToNot(HaveOccurred())

// Check that the new dependency has been added to the cluster in memory:
var operator *models.MonitoredOperator
for _, current := range cluster.MonitoredOperators {
if current.Name == "myoperator" {
operator = current
}
}
Expect(operator).ToNot(BeNil())

// Check that the new dependency has been saved to the database:
err = db.Where(&models.MonitoredOperator{
ClusterID: clusterID,
Name: "myoperator",
}).First(&models.MonitoredOperator{}).Error
Expect(err).ToNot(HaveOccurred())
})
})
})
Loading