diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml
index 6ce080b3..76cb378e 100644
--- a/config/manager/manager.yaml
+++ b/config/manager/manager.yaml
@@ -41,6 +41,9 @@ spec:
         - --provider-config=/etc/kruise-game/config.toml
         image: controller:latest
         name: manager
+        ports:
+        - name: https
+          containerPort: 8080
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:
diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml
index d19136ae..9b8047b7 100644
--- a/config/prometheus/monitor.yaml
+++ b/config/prometheus/monitor.yaml
@@ -11,10 +11,6 @@ spec:
   endpoints:
     - path: /metrics
       port: https
-      scheme: https
-      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
-      tlsConfig:
-        insecureSkipVerify: true
   selector:
     matchLabels:
       control-plane: controller-manager
diff --git a/main.go b/main.go
index 6d2e3d0b..0e681fd5 100644
--- a/main.go
+++ b/main.go
@@ -22,7 +22,10 @@ import (
 	kruiseV1beta1 "github.com/openkruise/kruise-api/apps/v1beta1"
 	"github.com/openkruise/kruise-game/cloudprovider"
 	cpmanager "github.com/openkruise/kruise-game/cloudprovider/manager"
+	kruisegameclientset "github.com/openkruise/kruise-game/pkg/client/clientset/versioned"
+	kruisegamevisions "github.com/openkruise/kruise-game/pkg/client/informers/externalversions"
 	controller "github.com/openkruise/kruise-game/pkg/controllers"
+	"github.com/openkruise/kruise-game/pkg/metrics"
 	"github.com/openkruise/kruise-game/pkg/webhook"
 	"os"
 	"time"
@@ -97,7 +100,8 @@ func main() {
 		}
 	}
 
-	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+	restConfig := ctrl.GetConfigOrDie()
+	mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
 		Scheme:             scheme,
 		MetricsBindAddress: metricsAddr,
 		Port:               9443,
@@ -164,6 +168,21 @@ func main() {
 		}
 	}()
 
+	kruisegameInformerFactory := kruisegamevisions.NewSharedInformerFactory(kruisegameclientset.NewForConfigOrDie(restConfig), 30*time.Second)
+	metricsController, err := metrics.NewController(kruisegameInformerFactory)
+	if err != nil {
+		setupLog.Error(err, "unable to create metrics controller")
+		os.Exit(1)
+	}
+	kruisegameInformerFactory.Start(signal.Done())
+	go func() {
+		// Log the error returned by Run; the outer err is nil at this point.
+		if err := metricsController.Run(signal); err != nil {
+			setupLog.Error(err, "unable to setup metrics controller")
+			os.Exit(1)
+		}
+	}()
+
 	setupLog.Info("starting kruise-game-manager")
 
 	if err := mgr.Start(signal); err != nil {
diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go
new file mode 100644
index 00000000..9dcbebc9
--- /dev/null
+++ b/pkg/metrics/controller.go
@@ -0,0 +1,270 @@
+/*
+Copyright 2023 The Kruise Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	gamekruisev1alpha1 "github.com/openkruise/kruise-game/apis/v1alpha1"
+	kruisegamevisions "github.com/openkruise/kruise-game/pkg/client/informers/externalversions"
+	kruisegamelister "github.com/openkruise/kruise-game/pkg/client/listers/apis/v1alpha1"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog/v2"
+	"sync"
+	"time"
+)
+
+// Controller keeps the okg_* Prometheus metrics in sync with the GameServer
+// and GameServerSet objects observed through shared informers.
+type Controller struct {
+	gameServerLister    kruisegamelister.GameServerLister
+	gameServerSetLister kruisegamelister.GameServerSetLister
+	gameServerSynced    cache.InformerSynced
+	gameServerSetSynced cache.InformerSynced
+	// stateLock guards gameServerStateLastChange.
+	stateLock sync.Mutex
+	// opsStateLock guards gameServerOpsStateLastChange.
+	opsStateLock sync.Mutex
+	// gameServerStateLastChange maps "namespace/name" to the GameServer age,
+	// in seconds, at which its current state was last seen to change.
+	gameServerStateLastChange map[string]float64
+	// gameServerOpsStateLastChange is the opsState counterpart of
+	// gameServerStateLastChange.
+	gameServerOpsStateLastChange map[string]float64
+}
+
+// NewController wires the metrics event handlers into the GameServer and
+// GameServerSet informers of kruisegameInformerFactory. The caller must start
+// the factory and then call Run. The returned error is currently always nil.
+func NewController(kruisegameInformerFactory kruisegamevisions.SharedInformerFactory) (*Controller, error) {
+	gameServer := kruisegameInformerFactory.Game().V1alpha1().GameServers()
+	gsInformer := gameServer.Informer()
+
+	gameServerSet := kruisegameInformerFactory.Game().V1alpha1().GameServerSets()
+	gssInformer := gameServerSet.Informer()
+
+	c := &Controller{
+		gameServerLister:             gameServer.Lister(),
+		gameServerSetLister:          gameServerSet.Lister(),
+		gameServerSynced:             gsInformer.HasSynced,
+		gameServerSetSynced:          gssInformer.HasSynced,
+		gameServerStateLastChange:    make(map[string]float64),
+		gameServerOpsStateLastChange: make(map[string]float64),
+	}
+
+	gsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc:    c.recordGsWhenAdd,
+		UpdateFunc: c.recordGsWhenUpdate,
+		DeleteFunc: c.recordGsWhenDelete,
+	})
+
+	gssInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		// AddFunc publishes replica gauges for sets that already exist when
+		// the informer starts, instead of waiting for their first update.
+		AddFunc: c.recordGssWhenChange,
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			c.recordGssWhenChange(newObj)
+		},
+		DeleteFunc: c.recordGssWhenDelete,
+	})
+
+	return c, nil
+}
+
+// recordGsWhenAdd accounts for a newly observed GameServer: it starts the
+// state/opsState duration clocks, bumps the total and per-state counts, and
+// publishes the priority gauges.
+func (c *Controller) recordGsWhenAdd(obj interface{}) {
+	gs, ok := obj.(*gamekruisev1alpha1.GameServer)
+	if !ok {
+		return
+	}
+
+	c.calcDurationState(gs, "Add")
+	c.calcDurationOpsState(gs, "Add")
+
+	GameServersTotal.WithLabelValues().Inc()
+	GameServersStateCount.WithLabelValues(string(gs.Status.CurrentState)).Inc()
+	GameServersOpsStateCount.WithLabelValues(string(gs.Spec.OpsState)).Inc()
+
+	recordGsPriorities(gs)
+}
+
+// recordGsWhenUpdate moves the per-state counts and observes the time spent
+// in the previous state/opsState when either changed, then refreshes the
+// priority gauges from the new object.
+func (c *Controller) recordGsWhenUpdate(oldObj, newObj interface{}) {
+	oldGs, ok := oldObj.(*gamekruisev1alpha1.GameServer)
+	if !ok {
+		return
+	}
+
+	newGs, ok := newObj.(*gamekruisev1alpha1.GameServer)
+	if !ok {
+		return
+	}
+
+	oldState := string(oldGs.Status.CurrentState)
+	oldOpsState := string(oldGs.Spec.OpsState)
+	newState := string(newGs.Status.CurrentState)
+	newOpsState := string(newGs.Spec.OpsState)
+	if oldState != newState {
+		GameServersStateCount.WithLabelValues(newState).Inc()
+		GameServersStateCount.WithLabelValues(oldState).Dec()
+		GameServersStateDuration.WithLabelValues(newGs.Name, newGs.Namespace, oldState).Observe(c.calcDurationState(newGs, "Update"))
+	}
+	if oldOpsState != newOpsState {
+		GameServersOpsStateCount.WithLabelValues(newOpsState).Inc()
+		GameServersOpsStateCount.WithLabelValues(oldOpsState).Dec()
+		GameServersOpsStateDuration.WithLabelValues(newGs.Name, newGs.Namespace, oldOpsState).Observe(c.calcDurationOpsState(newGs, "Update"))
+	}
+
+	// Always publish the new object's values. Comparing the old and new
+	// priority pointers says nothing about the values they hold, and a nil
+	// priority must not be dereferenced.
+	recordGsPriorities(newGs)
+}
+
+// recordGsWhenDelete observes the final state/opsState durations of a deleted
+// GameServer, decrements the per-state counts and drops its priority gauges.
+func (c *Controller) recordGsWhenDelete(obj interface{}) {
+	// A delete missed by the watch is delivered wrapped in a tombstone.
+	if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+		obj = tombstone.Obj
+	}
+	gs, ok := obj.(*gamekruisev1alpha1.GameServer)
+	if !ok {
+		return
+	}
+
+	state := string(gs.Status.CurrentState)
+	opsState := string(gs.Spec.OpsState)
+
+	GameServersStateDuration.WithLabelValues(gs.Name, gs.Namespace, state).Observe(c.calcDurationState(gs, "Delete"))
+	GameServersOpsStateDuration.WithLabelValues(gs.Name, gs.Namespace, opsState).Observe(c.calcDurationOpsState(gs, "Delete"))
+
+	GameServersStateCount.WithLabelValues(state).Dec()
+	GameServersOpsStateCount.WithLabelValues(opsState).Dec()
+	GameServerDeletionPriority.DeleteLabelValues(gs.Name, gs.Namespace)
+	GameServerUpdatePriority.DeleteLabelValues(gs.Name, gs.Namespace)
+}
+
+// recordGsPriorities publishes the deletion and update priority gauges of gs,
+// reporting 0 for a priority that is not set.
+func recordGsPriorities(gs *gamekruisev1alpha1.GameServer) {
+	dp := 0
+	up := 0
+	if gs.Status.DeletionPriority != nil {
+		dp = gs.Status.DeletionPriority.IntValue()
+	}
+	if gs.Status.UpdatePriority != nil {
+		up = gs.Status.UpdatePriority.IntValue()
+	}
+	GameServerDeletionPriority.WithLabelValues(gs.Name, gs.Namespace).Set(float64(dp))
+	GameServerUpdatePriority.WithLabelValues(gs.Name, gs.Namespace).Set(float64(up))
+}
+
+// recordGssWhenChange publishes the replica gauges of a GameServerSet. The
+// optional maintaining/waitToBeDeleted counts are reported as 0 while unset.
+func (c *Controller) recordGssWhenChange(obj interface{}) {
+	gss, ok := obj.(*gamekruisev1alpha1.GameServerSet)
+	if !ok {
+		return
+	}
+
+	maintaining := 0.0
+	waitToBeDeleted := 0.0
+	if gss.Status.MaintainingReplicas != nil {
+		maintaining = float64(*gss.Status.MaintainingReplicas)
+	}
+	if gss.Status.WaitToBeDeletedReplicas != nil {
+		waitToBeDeleted = float64(*gss.Status.WaitToBeDeletedReplicas)
+	}
+
+	GameServerSetsReplicasCount.WithLabelValues(gss.Name, gss.Namespace, "current").Set(float64(gss.Status.CurrentReplicas))
+	GameServerSetsReplicasCount.WithLabelValues(gss.Name, gss.Namespace, "ready").Set(float64(gss.Status.ReadyReplicas))
+	GameServerSetsReplicasCount.WithLabelValues(gss.Name, gss.Namespace, "available").Set(float64(gss.Status.AvailableReplicas))
+	GameServerSetsReplicasCount.WithLabelValues(gss.Name, gss.Namespace, "maintaining").Set(maintaining)
+	GameServerSetsReplicasCount.WithLabelValues(gss.Name, gss.Namespace, "waitToBeDeleted").Set(waitToBeDeleted)
+}
+
+// recordGssWhenDelete drops every replica gauge of a deleted GameServerSet.
+func (c *Controller) recordGssWhenDelete(obj interface{}) {
+	// A delete missed by the watch is delivered wrapped in a tombstone.
+	if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+		obj = tombstone.Obj
+	}
+	gss, ok := obj.(*gamekruisev1alpha1.GameServerSet)
+	if !ok {
+		return
+	}
+
+	for _, status := range []string{"current", "ready", "available", "maintaining", "waitToBeDeleted"} {
+		GameServerSetsReplicasCount.DeleteLabelValues(gss.Name, gss.Namespace, status)
+	}
+}
+
+// Run blocks until both informer caches have synced and then until ctx is
+// done. It returns an error only when the caches could not be synced.
+func (c *Controller) Run(ctx context.Context) error {
+	klog.Info("Wait for metrics controller cache sync")
+	if !cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced, c.gameServerSetSynced) {
+		return errors.New("failed to wait for caches to sync")
+	}
+	<-ctx.Done()
+	return nil
+}
+
+// calcDurationState returns how long, in seconds, the GameServer stayed in
+// the state it is leaving and updates the bookkeeping according to action
+// ("Add", "Update" or "Delete").
+func (c *Controller) calcDurationState(newGs *gamekruisev1alpha1.GameServer, action string) float64 {
+	c.stateLock.Lock()
+	defer c.stateLock.Unlock()
+	return calcDuration(c.gameServerStateLastChange, newGs, action)
+}
+
+// calcDurationOpsState is the opsState counterpart of calcDurationState.
+func (c *Controller) calcDurationOpsState(newGs *gamekruisev1alpha1.GameServer, action string) float64 {
+	c.opsStateLock.Lock()
+	defer c.opsStateLock.Unlock()
+	return calcDuration(c.gameServerOpsStateLastChange, newGs, action)
+}
+
+// calcDuration implements the bookkeeping shared by calcDurationState and
+// calcDurationOpsState. lastChange maps "namespace/name" to the GameServer
+// age, in seconds, at its previous change; the caller must hold the lock
+// guarding lastChange. "Add" records the current age and returns 0; any other
+// action returns the age elapsed since the recorded one. "Delete" forgets the
+// key afterwards, while the remaining actions store the current age.
+func calcDuration(lastChange map[string]float64, gs *gamekruisev1alpha1.GameServer, action string) float64 {
+	age := time.Since(gs.CreationTimestamp.Time).Seconds()
+	key := fmt.Sprintf("%s/%s", gs.Namespace, gs.Name)
+
+	if action == "Add" {
+		lastChange[key] = age
+		return 0
+	}
+	duration := age - lastChange[key]
+	if action == "Delete" {
+		delete(lastChange, key)
+	} else {
+		lastChange[key] = age
+	}
+	return duration
+}
diff --git a/pkg/metrics/prometheus_metrics.go b/pkg/metrics/prometheus_metrics.go
new file mode 100644
index 00000000..5c9d2bc8
--- /dev/null
+++ b/pkg/metrics/prometheus_metrics.go
@@ -0,0 +1,107 @@
+/*
+Copyright 2023 The Kruise Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// init registers every okg_* collector with the controller-runtime metrics
+// registry.
+func init() {
+	metrics.Registry.MustRegister(
+		GameServersStateCount,
+		GameServersOpsStateCount,
+		GameServersTotal,
+		GameServerSetsReplicasCount,
+		GameServersStateDuration,
+		GameServersOpsStateDuration,
+		GameServerDeletionPriority,
+		GameServerUpdatePriority,
+	)
+}
+
+var (
+	// GameServersStateCount is the number of GameServers per current state.
+	GameServersStateCount = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "okg_gameservers_state_count",
+			Help: "The number of gameservers per state",
+		},
+		[]string{"state"},
+	)
+	// GameServersOpsStateCount is the number of GameServers per opsState.
+	GameServersOpsStateCount = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "okg_gameservers_opsState_count",
+			Help: "The number of gameservers per opsState",
+		},
+		[]string{"opsState"},
+	)
+	// GameServersTotal counts every GameServer observed since start-up.
+	GameServersTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "okg_gameservers_total",
+			Help: "The total of gameservers",
+		},
+		[]string{},
+	)
+	// GameServerSetsReplicasCount is the replica count of each GameServerSet,
+	// one series per gsStatus label value.
+	GameServerSetsReplicasCount = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "okg_gameserversets_replicas_count",
+			Help: "The number of replicas per gameserverset",
+		},
+		[]string{"gssName", "gssNs", "gsStatus"},
+	)
+	// GameServersStateDuration is the time, in seconds, GameServers spent in
+	// a state before leaving it.
+	GameServersStateDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "okg_gameservers_state_duration",
+			Help: "The distribution of gameserver state duration in seconds.",
+		},
+		[]string{"gsName", "gsNs", "state"},
+	)
+	// GameServersOpsStateDuration is the time, in seconds, GameServers spent
+	// in an opsState before leaving it.
+	GameServersOpsStateDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "okg_gameservers_opsState_duration",
+			Help: "The distribution of gameserver opsState duration in seconds.",
+		},
+		[]string{"gsName", "gsNs", "opsState"},
+	)
+	// GameServerDeletionPriority is the deletionPriority of each GameServer.
+	GameServerDeletionPriority = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "okg_gameserver_deletion_priority",
+			Help: "The deletionPriority of gameserver.",
+		},
+		[]string{"gsName", "gsNs"},
+	)
+	// GameServerUpdatePriority is the updatePriority of each GameServer.
+	GameServerUpdatePriority = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "okg_gameserver_update_priority",
+			Help: "The updatePriority of gameserver.",
+		},
+		[]string{"gsName", "gsNs"},
+	)
+)