diff --git a/bindata/manifests/metrics-exporter/metrics-daemonset.yaml b/bindata/manifests/metrics-exporter/metrics-daemonset.yaml new file mode 100644 index 000000000..6e433f07d --- /dev/null +++ b/bindata/manifests/metrics-exporter/metrics-daemonset.yaml @@ -0,0 +1,118 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: sriov-network-metrics-exporter + name: sriov-network-metrics-exporter + namespace: {{.Namespace}} +spec: + selector: + matchLabels: + app: sriov-network-metrics-exporter + template: + metadata: + labels: + app: sriov-network-metrics-exporter + spec: + hostNetwork: true + serviceAccountName: metrics-exporter-sa + {{- if .ImagePullSecrets }} + imagePullSecrets: + {{- range .ImagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + containers: + - name: metrics-exporter + args: + - --web.listen-address=127.0.0.1:{{.MetricsExporterPort}} + - --path.kubecgroup=/sys/fs/cgroup + - --path.sysbuspci=/host/sys/bus/pci/devices/ + - --path.sysclassnet=/host/sys/class/net/ + - --path.cpucheckpoint=/host/cpu_manager_state + - --path.kubeletsocket=/host/kubelet.sock + - --collector.kubepoddevice=true + - --collector.vfstatspriority=netlink,sysfs + image: {{.Image}} + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 100Mi + cpu: 100m + securityContext: + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + volumeMounts: + - mountPath: /host/kubelet.sock + name: kubeletsocket + - mountPath: /host/sys/bus/pci/devices + name: sysbuspcidevices + readOnly: true + - mountPath: /host/sys/devices + name: sysdevices + readOnly: true + - mountPath: /host/sys/class/net + name: sysclassnet + readOnly: true + - mountPath: /host/cpu_manager_state + name: cpucheckpoint + readOnly: true + - name: kube-rbac-proxy + image: '{{.MetricsExporterKubeRbacProxyImage}}' + imagePullPolicy: IfNotPresent + args: + - --logtostderr + - --secure-listen-address=[$(HOST_IP)]:{{.MetricsExporterPort}} + - 
--tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
+        - --upstream=http://127.0.0.1:{{.MetricsExporterPort}}/
+        - --tls-private-key-file=/etc/metrics/tls.key
+        - --tls-cert-file=/etc/metrics/tls.crt
+        ports:
+        - containerPort: {{.MetricsExporterPort}}
+          name: https-metrics
+        env:
+        - name: HOST_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.hostIP
+        resources:
+          requests:
+            cpu: 10m
+            memory: 20Mi
+        volumeMounts:
+        - name: metrics-certs
+          mountPath: /etc/metrics
+          readOnly: true
+      nodeSelector: # values are quoted so that "true", "1" or "" stay YAML strings
+        {{- range $key, $value := .NodeSelectorField }}
+        {{ $key }}: "{{ $value }}"
+        {{- end }}
+      restartPolicy: Always
+      volumes:
+      - hostPath:
+          path: /var/lib/kubelet/pod-resources/kubelet.sock
+          type: "Socket"
+        name: kubeletsocket
+      - hostPath:
+          path: /var/lib/kubelet/cpu_manager_state
+          type: "File"
+        name: cpucheckpoint
+      - hostPath:
+          path: /sys/class/net
+          type: "Directory"
+        name: sysclassnet
+      - hostPath:
+          path: /sys/bus/pci/devices
+          type: "Directory"
+        name: sysbuspcidevices
+      - hostPath:
+          path: /sys/devices
+          type: "Directory"
+        name: sysdevices
+      - name: metrics-certs
+        secret:
+          defaultMode: 420
+          secretName: {{ .MetricsExporterSecretName }}
diff --git a/bindata/manifests/metrics-exporter/metrics-rbac.yaml b/bindata/manifests/metrics-exporter/metrics-rbac.yaml
new file mode 100644
index 000000000..0866f9613
--- /dev/null
+++ b/bindata/manifests/metrics-exporter/metrics-rbac.yaml
@@ -0,0 +1,66 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: metrics-exporter-sa
+  namespace: {{.Namespace}}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: metrics-exporter-role
+  namespace: {{.Namespace}}
+rules:
+  - apiGroups:
+      - security.openshift.io
+    resourceNames:
+      - privileged
+    resources:
+      - securitycontextconstraints
+    verbs:
+      - use
+---
+apiVersion: 
rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: metrics-exporter-rb
+  namespace: {{.Namespace}}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: metrics-exporter-role
+subjects:
+  - kind: ServiceAccount
+    name: metrics-exporter-sa
+    namespace: {{.Namespace}}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: sriov-metrics-kube-rbac-role
+rules:
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: sriov-metrics-kube-rbac-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: sriov-metrics-kube-rbac-role
+subjects:
+- kind: ServiceAccount
+  name: metrics-exporter-sa
+  namespace: {{.Namespace}}
diff --git a/bindata/manifests/metrics-exporter/metrics-service.yaml b/bindata/manifests/metrics-exporter/metrics-service.yaml
new file mode 100644
index 000000000..69e2349ad
--- /dev/null
+++ b/bindata/manifests/metrics-exporter/metrics-service.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: sriov-network-metrics-exporter-service
+  namespace: {{.Namespace}}
+  annotations:
+    prometheus.io/target: "true"
+    {{- if eq .ClusterType "openshift" }}
+    service.beta.openshift.io/serving-cert-secret-name: {{ .MetricsExporterSecretName }}
+    {{- end }}
+  labels:
+    name: sriov-network-metrics-exporter-service
+spec:
+  selector:
+    app: sriov-network-metrics-exporter
+  ports:
+    - protocol: TCP
+      name: sriov-network-metrics
+      port: {{ .MetricsExporterPort }}
+      targetPort: {{ .MetricsExporterPort }}
diff --git a/controllers/helper.go b/controllers/helper.go
index 5d169b2c5..9ff735473 100644
--- a/controllers/helper.go
+++ b/controllers/helper.go
@@ -376,3 +376,27 @@ func syncDaemonSet(ctx context.Context, client k8sclient.Client, scheme *runtime
 	}
 	return nil
 }
+
+// updateDaemonsetNodeSelector overrides the pod template node selector of the
+// DaemonSet held in obj with nodeSelector. obj is converted to a typed
+// DaemonSet, patched and converted back in place. When nodeSelector is empty
+// the object is left untouched.
+func updateDaemonsetNodeSelector(obj *uns.Unstructured, nodeSelector map[string]string) error {
+	if len(nodeSelector) == 0 {
+		return nil
+	}
+
+	ds := &appsv1.DaemonSet{}
+	scheme := kscheme.Scheme
+	// %w keeps the conversion error in the chain for errors.Is / errors.As.
+	if err := scheme.Convert(obj, ds, nil); err != nil {
+		return fmt.Errorf("failed to convert Unstructured [%s] to DaemonSet: %w", obj.GetName(), err)
+	}
+
+	ds.Spec.Template.Spec.NodeSelector = nodeSelector
+
+	if err := scheme.Convert(ds, obj, nil); err != nil {
+		return fmt.Errorf("failed to convert DaemonSet [%s] to Unstructured: %w", obj.GetName(), err)
+	}
+	return nil
+}
diff --git a/controllers/sriovoperatorconfig_controller.go b/controllers/sriovoperatorconfig_controller.go
index 9e90a5688..bd2b6c70f 100644
--- a/controllers/sriovoperatorconfig_controller.go
+++ b/controllers/sriovoperatorconfig_controller.go
@@ -122,6 +122,10 @@ func (r *SriovOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.
 		return reconcile.Result{}, err
 	}
 
+	if err = r.syncMetricsExporter(ctx, defaultConfig); err != nil {
+		return reconcile.Result{}, err
+	}
+
 	// For Openshift we need to create the systemd files using a machine config
 	if vars.ClusterType == consts.ClusterTypeOpenshift {
 		// TODO: add support for hypershift as today there is no MCO on hypershift clusters
@@ -199,27 +203,70 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
 	}
 	// Sync DaemonSets
 	for _, obj := range objs {
-		if obj.GetKind() == "DaemonSet" && len(dc.Spec.ConfigDaemonNodeSelector) > 0 {
-			scheme := kscheme.Scheme
-			ds := &appsv1.DaemonSet{}
-			err = scheme.Convert(obj, ds, nil)
+		if obj.GetKind() == "DaemonSet" {
+			err = updateDaemonsetNodeSelector(obj, dc.Spec.ConfigDaemonNodeSelector)
 			if err != nil {
-				logger.Error(err, "Fail to convert to DaemonSet")
 				return err
 			}
-			ds.Spec.Template.Spec.NodeSelector = dc.Spec.ConfigDaemonNodeSelector
-			err = scheme.Convert(ds, obj, nil)
+		}
+
+		err = r.syncK8sResource(ctx, dc, obj)
+		if err != nil {
+			logger.Error(err, "Couldn't sync SR-IOV daemons objects")
+			return err
+		}
+	}
+	return nil
+}
+
+// syncMetricsExporter renders the metrics exporter manifests and, depending on
+// the metricsExporter feature gate in dc, either syncs the rendered objects or
+// deletes them. Images, port and TLS secret name are read from the operator
+// environment. It returns the first render, sync or delete error.
+func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context, dc *sriovnetworkv1.SriovOperatorConfig) error {
+	logger := log.Log.WithName("syncMetricsExporter")
+	logger.V(1).Info("Start to sync metrics exporter")
+
+	data := render.MakeRenderData()
+	data.Data["Image"] = os.Getenv("METRICS_EXPORTER_IMAGE")
+	data.Data["Namespace"] = vars.Namespace
+	data.Data["ImagePullSecrets"] = GetImagePullSecrets()
+	data.Data["MetricsExporterSecretName"] = os.Getenv("METRICS_EXPORTER_SECRET_NAME")
+	data.Data["MetricsExporterPort"] = os.Getenv("METRICS_EXPORTER_PORT")
+	data.Data["MetricsExporterKubeRbacProxyImage"] = os.Getenv("METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE")
+	data.Data["ClusterType"] = vars.ClusterType
+	data.Data["NodeSelectorField"] = GetDefaultNodeSelector()
+	// Only a non-empty selector overrides the default, mirroring
+	// updateDaemonsetNodeSelector; an empty map would drop the node selector.
+	if len(dc.Spec.ConfigDaemonNodeSelector) > 0 {
+		data.Data["NodeSelectorField"] = dc.Spec.ConfigDaemonNodeSelector
+	}
+
+	objs, err := render.RenderDir(consts.MetricsExporterPath, &data)
+	if err != nil {
+		logger.Error(err, "Fail to render metrics exporter manifests")
+		return err
+	}
+
+	deployMetricsExporter, ok := dc.Spec.FeatureGates[consts.MetricsExporterFeatureGate]
+	if ok && deployMetricsExporter {
+		for _, obj := range objs {
+			err = r.syncK8sResource(ctx, dc, obj)
 			if err != nil {
-				logger.Error(err, "Fail to convert to Unstructured")
+				logger.Error(err, "Couldn't sync metrics exporter objects")
 				return err
 			}
 		}
-		err = r.syncK8sResource(ctx, dc, obj)
+		return nil
+	}
+
+	for _, obj := range objs {
+		err = r.deleteK8sResource(ctx, obj)
 		if err != nil {
-			logger.Error(err, "Couldn't sync SR-IoV daemons objects")
 			return err
 		}
 	}
+
 	return nil
 }
 
@@ -387,7 +434,7 @@ func (r SriovOperatorConfigReconciler) setLabelInsideObject(ctx context.Context,
 		}
 		err := r.syncK8sResource(ctx, cr, obj)
 		if err != nil {
-			logger.Error(err, "Couldn't sync SR-IoV daemons objects")
+			logger.Error(err, "Couldn't 
sync SR-IOV daemons objects")
 			return err
 		}
 	}
diff --git a/controllers/sriovoperatorconfig_controller_test.go b/controllers/sriovoperatorconfig_controller_test.go
index d7cc7fcb9..c642765f4 100644
--- a/controllers/sriovoperatorconfig_controller_test.go
+++ b/controllers/sriovoperatorconfig_controller_test.go
@@ -7,6 +7,7 @@ import (
 
 	admv1 "k8s.io/api/admissionregistration/v1"
 	appsv1 "k8s.io/api/apps/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/wait"
@@ -327,5 +328,45 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
 			})
 			Expect(err).ToNot(HaveOccurred())
 		})
+
+		It("should deploy the metrics-exporter when the feature gate is enabled", func() {
+			config := &sriovnetworkv1.SriovOperatorConfig{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: testNamespace, Name: "default"}, config)).NotTo(HaveOccurred())
+
+			// Precondition: the gate is off, so the DaemonSet rendered by the manifest must not exist yet.
+			daemonSet := &appsv1.DaemonSet{}
+			err := k8sClient.Get(ctx, types.NamespacedName{Name: "sriov-network-metrics-exporter", Namespace: testNamespace}, daemonSet)
+			Expect(err).To(HaveOccurred())
+			Expect(errors.IsNotFound(err)).To(BeTrue())
+
+			By("Turn `metricsExporter` feature gate on")
+			config.Spec.FeatureGates = map[string]bool{constants.MetricsExporterFeatureGate: true}
+			err = k8sClient.Update(ctx, config)
+			Expect(err).NotTo(HaveOccurred())
+
+			DeferCleanup(func() {
+				config.Spec.FeatureGates = map[string]bool{}
+				err = k8sClient.Update(ctx, config)
+				Expect(err).NotTo(HaveOccurred())
+			})
+
+			err = util.WaitForNamespacedObject(&appsv1.DaemonSet{}, k8sClient, testNamespace, "sriov-network-metrics-exporter", util.RetryInterval, util.APITimeout)
+			Expect(err).NotTo(HaveOccurred())
+
+			err = util.WaitForNamespacedObject(&v1.Service{}, k8sClient, testNamespace, "sriov-network-metrics-exporter-service", util.RetryInterval, util.APITimeout)
+			Expect(err).ToNot(HaveOccurred())
+
+			By("Turn `metricsExporter` feature gate off")
+			config.Spec.FeatureGates = map[string]bool{}
+			err = k8sClient.Update(ctx, config)
+			Expect(err).NotTo(HaveOccurred())
+
+			err = util.WaitForNamespacedObjectDeleted(&appsv1.DaemonSet{}, k8sClient, testNamespace, "sriov-network-metrics-exporter", util.RetryInterval, util.APITimeout)
+			Expect(err).NotTo(HaveOccurred())
+
+			err = util.WaitForNamespacedObjectDeleted(&v1.Service{}, k8sClient, testNamespace, "sriov-network-metrics-exporter-service", util.RetryInterval, util.APITimeout)
+			Expect(err).ToNot(HaveOccurred())
+		})
+
 	})
 })
diff --git a/controllers/suite_test.go b/controllers/suite_test.go
index 85f9b6075..b830e2fc2 100644
--- a/controllers/suite_test.go
+++ b/controllers/suite_test.go
@@ -127,6 +127,14 @@ var _ = BeforeSuite(func() {
 	Expect(err).NotTo(HaveOccurred())
 	err = os.Setenv("OPERATOR_NAME", "sriov-network-operator")
 	Expect(err).NotTo(HaveOccurred())
+	err = os.Setenv("METRICS_EXPORTER_IMAGE", "mock-image")
+	Expect(err).NotTo(HaveOccurred())
+	err = os.Setenv("METRICS_EXPORTER_SECRET_NAME", "metrics-exporter-cert")
+	Expect(err).NotTo(HaveOccurred())
+	err = os.Setenv("METRICS_EXPORTER_PORT", "9110")
+	Expect(err).NotTo(HaveOccurred())
+	err = os.Setenv("METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE", "mock-image")
+	Expect(err).NotTo(HaveOccurred())
 
 	By("bootstrapping test environment")
 	testEnv = &envtest.Environment{
diff --git a/deploy/operator.yaml b/deploy/operator.yaml
index 576dd3f34..18eeb434c 100644
--- a/deploy/operator.yaml
+++ b/deploy/operator.yaml
@@ -68,6 +68,10 @@ spec:
               value: $SRIOV_NETWORK_CONFIG_DAEMON_IMAGE
             - name: SRIOV_NETWORK_WEBHOOK_IMAGE
               value: $SRIOV_NETWORK_WEBHOOK_IMAGE
+            - name: METRICS_EXPORTER_IMAGE
+              value: $METRICS_EXPORTER_IMAGE
+            - name: METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
+              value: $METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
             - name: RESOURCE_PREFIX
               value: $RESOURCE_PREFIX
             - name: DEV_MODE
@@ -100,3 +104,7 @@ spec:
               value: $ADMISSION_CONTROLLERS_CERTIFICATES_OPERATOR_CA_CRT
             - name: ADMISSION_CONTROLLERS_CERTIFICATES_INJECTOR_CA_CRT
               value: $ADMISSION_CONTROLLERS_CERTIFICATES_INJECTOR_CA_CRT
+            - name: METRICS_EXPORTER_SECRET_NAME
+              value: $METRICS_EXPORTER_SECRET_NAME
+            - name: METRICS_EXPORTER_PORT
+              value: "$METRICS_EXPORTER_PORT"
diff --git a/deployment/sriov-network-operator-chart/README.md b/deployment/sriov-network-operator-chart/README.md
index ee5192473..ddf962305 100644
--- a/deployment/sriov-network-operator-chart/README.md
+++ b/deployment/sriov-network-operator-chart/README.md
@@ -84,6 +84,8 @@ We have introduced the following Chart parameters.
 | `operator.resourcePrefix` | string | `openshift.io` | Device plugin resource prefix |
 | `operator.cniBinPath` | string | `/opt/cni/bin` | Path for CNI binary |
 | `operator.clustertype` | string | `kubernetes` | Cluster environment type |
+| `operator.metricsExporter.port` | string | `9110` | Port where the Network Metrics Exporter listens |
+| `operator.metricsExporter.certificates.secretName` | string | `metrics-exporter-cert` | Secret name to serve metrics via TLS. The secret must have the same fields as `operator.admissionControllers.certificates.secretNames` |
 
 #### Admission Controllers parameters
 
@@ -140,3 +142,5 @@ This section contains general parameters that apply to both the operator and dae
 | `images.sriovDevicePlugin` | SR-IOV device plugin image |
 | `images.resourcesInjector` | Resources Injector image |
 | `images.webhook` | Operator Webhook image |
+| `images.metricsExporter` | Network Metrics Exporter image |
+| `images.metricsExporterKubeRbacProxy` | Kube RBAC Proxy image used for metrics exporter |
diff --git a/deployment/sriov-network-operator-chart/templates/operator.yaml b/deployment/sriov-network-operator-chart/templates/operator.yaml
index 1200e34ac..52da05721 100644
--- a/deployment/sriov-network-operator-chart/templates/operator.yaml
+++ b/deployment/sriov-network-operator-chart/templates/operator.yaml
@@ -68,6 +68,15 @@ spec:
               value: {{ .Values.images.sriovConfigDaemon }}
             - name: SRIOV_NETWORK_WEBHOOK_IMAGE
               value: {{ .Values.images.webhook }}
+            - name: METRICS_EXPORTER_IMAGE
+              value: {{ .Values.images.metricsExporter }}
+            # Env values must be strings: quote the port so it is not rendered as a YAML integer.
+            - name: METRICS_EXPORTER_PORT
+              value: {{ .Values.operator.metricsExporter.port | quote }}
+            - name: METRICS_EXPORTER_SECRET_NAME
+              value: {{ .Values.operator.metricsExporter.certificates.secretName }}
+            - name: METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
+              value: {{ .Values.images.metricsExporterKubeRbacProxy }}
             - name: RESOURCE_PREFIX
               value: {{ .Values.operator.resourcePrefix }}
             - name: IMAGE_PULL_SECRETS
diff --git a/deployment/sriov-network-operator-chart/values.yaml b/deployment/sriov-network-operator-chart/values.yaml
index 28a4d06dc..89de1019d 100644
--- a/deployment/sriov-network-operator-chart/values.yaml
+++ b/deployment/sriov-network-operator-chart/values.yaml
@@ -27,6 +27,10 @@ operator:
   resourcePrefix: "openshift.io"
   cniBinPath: "/opt/cni/bin"
   clusterType: "kubernetes"
+  metricsExporter:
+    port: "9110"
+    certificates:
+      secretName: "metrics-exporter-cert"
   admissionControllers:
     enabled: false
     certificates:
@@ -102,5 +106,7 @@ images:
   sriovDevicePlugin: ghcr.io/k8snetworkplumbingwg/sriov-network-device-plugin
   resourcesInjector: ghcr.io/k8snetworkplumbingwg/network-resources-injector
   webhook: ghcr.io/k8snetworkplumbingwg/sriov-network-operator-webhook
+  metricsExporter: ghcr.io/k8snetworkplumbingwg/sriov-network-metrics-exporter
+  metricsExporterKubeRbacProxy: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0
 
 imagePullSecrets: []
diff --git a/hack/deploy-wait.sh b/hack/deploy-wait.sh
index 5a0af6023..0c9093a86 100755
--- a/hack/deploy-wait.sh
+++ b/hack/deploy-wait.sh
@@ -20,7 +20,7 @@ done
 
 if ! 
$ready; then echo "Timed out waiting for features to be ready" - kubectl get nodes - kubectl cluster-info dump -n ${NAMESPACE} + ${OPERATOR_EXEC} get nodes + ${OPERATOR_EXEC} cluster-info dump -n ${NAMESPACE} exit 1 fi diff --git a/hack/env.sh b/hack/env.sh index 5b08d7b8b..55b7179e9 100755 --- a/hack/env.sh +++ b/hack/env.sh @@ -7,16 +7,20 @@ if [ -z $SKIP_VAR_SET ]; then export NETWORK_RESOURCES_INJECTOR_IMAGE=${NETWORK_RESOURCES_INJECTOR_IMAGE:-ghcr.io/k8snetworkplumbingwg/network-resources-injector} export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE=${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-operator-config-daemon} export SRIOV_NETWORK_WEBHOOK_IMAGE=${SRIOV_NETWORK_WEBHOOK_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-operator-webhook} + export METRICS_EXPORTER_IMAGE=${METRICS_EXPORTER_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-metrics-exporter} export SRIOV_NETWORK_OPERATOR_IMAGE=${SRIOV_NETWORK_OPERATOR_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-network-operator} + export METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE=${METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE:-gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0} else # ensure that OVS_CNI_IMAGE is set, empty string is a valid value OVS_CNI_IMAGE=${OVS_CNI_IMAGE:-} + METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE=${METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE:-} [ -z $SRIOV_CNI_IMAGE ] && echo "SRIOV_CNI_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $SRIOV_INFINIBAND_CNI_IMAGE ] && echo "SRIOV_INFINIBAND_CNI_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $SRIOV_DEVICE_PLUGIN_IMAGE ] && echo "SRIOV_DEVICE_PLUGIN_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $NETWORK_RESOURCES_INJECTOR_IMAGE ] && echo "NETWORK_RESOURCES_INJECTOR_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $SRIOV_NETWORK_CONFIG_DAEMON_IMAGE ] && echo "SRIOV_NETWORK_CONFIG_DAEMON_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $SRIOV_NETWORK_WEBHOOK_IMAGE ] && echo "SRIOV_NETWORK_WEBHOOK_IMAGE is 
empty but SKIP_VAR_SET is set" && exit 1 + [ -z $METRICS_EXPORTER_IMAGE ] && echo "METRICS_EXPORTER_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 [ -z $SRIOV_NETWORK_OPERATOR_IMAGE ] && echo "SRIOV_NETWORK_OPERATOR_IMAGE is empty but SKIP_VAR_SET is set" && exit 1 fi @@ -35,3 +39,5 @@ export ADMISSION_CONTROLLERS_CERTIFICATES_OPERATOR_CA_CRT=${ADMISSION_CONTROLLER export ADMISSION_CONTROLLERS_CERTIFICATES_INJECTOR_CA_CRT=${ADMISSION_CONTROLLERS_CERTIFICATES_INJECTOR_CA_CRT:-""} export DEV_MODE=${DEV_MODE:-"FALSE"} export OPERATOR_LEADER_ELECTION_ENABLE=${OPERATOR_LEADER_ELECTION_ENABLE:-"false"} +export METRICS_EXPORTER_SECRET_NAME=${METRICS_EXPORTER_SECRET_NAME:-"metrics-exporter-cert"} +export METRICS_EXPORTER_PORT=${METRICS_EXPORTER_PORT:-"9110"} diff --git a/hack/release/chart-update.sh b/hack/release/chart-update.sh index 539cf55b2..282521020 100755 --- a/hack/release/chart-update.sh +++ b/hack/release/chart-update.sh @@ -47,6 +47,7 @@ SRIOV_CNI_TAG=$(get_latest_github_tag k8snetworkplumbingwg sriov-cni) OVS_CNI_TAG=$(get_latest_github_tag k8snetworkplumbingwg ovs-cni) NETWORK_RESOURCE_INJECTOR_TAG=$(get_latest_github_tag k8snetworkplumbingwg network-resources-injector) SRIOV_DEVICE_PLUGIN_TAG=$(get_latest_github_tag k8snetworkplumbingwg sriov-network-device-plugin) +METRICS_EXPORTER_TAG=$(get_latest_github_tag k8snetworkplumbingwg sriov-network-metrics-exporter) # patch values.yaml in-place @@ -62,6 +63,7 @@ $YQ_CMD -i ".images.ibSriovCni = \"ghcr.io/k8snetworkplumbingwg/ib-sriov-cni:${I $YQ_CMD -i ".images.ovsCni = \"ghcr.io/k8snetworkplumbingwg/ovs-cni-plugin:${OVS_CNI_TAG}\"" ${HELM_VALUES} $YQ_CMD -i ".images.sriovDevicePlugin = \"ghcr.io/k8snetworkplumbingwg/sriov-network-device-plugin:${SRIOV_DEVICE_PLUGIN_TAG}\"" ${HELM_VALUES} $YQ_CMD -i ".images.resourcesInjector = \"ghcr.io/k8snetworkplumbingwg/network-resources-injector:${NETWORK_RESOURCE_INJECTOR_TAG}\"" ${HELM_VALUES} +$YQ_CMD -i ".images.metricsExporter = 
\"ghcr.io/k8snetworkplumbingwg/sriov-network-metrics-exporter:${METRICS_EXPORTER_TAG}\"" ${HELM_VALUES} # patch Chart.yaml in-place $YQ_CMD -i ".version = \"${OPERATOR_TAG#"v"}\"" ${HELM_CHART} diff --git a/hack/run-e2e-conformance-virtual-cluster.sh b/hack/run-e2e-conformance-virtual-cluster.sh index ab6d6e9af..70752b5af 100755 --- a/hack/run-e2e-conformance-virtual-cluster.sh +++ b/hack/run-e2e-conformance-virtual-cluster.sh @@ -320,6 +320,10 @@ if [[ -v LOCAL_NETWORK_RESOURCES_INJECTOR_IMAGE ]]; then podman_tag_and_push ${LOCAL_NETWORK_RESOURCES_INJECTOR_IMAGE} ${NETWORK_RESOURCES_INJECTOR_IMAGE} fi +if [[ -v LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE ]]; then + export METRICS_EXPORTER_IMAGE="$controller_ip:5000/sriov-network-metrics-exporter:latest" + podman_tag_and_push ${LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE} ${METRICS_EXPORTER_IMAGE} +fi # remove the crio bridge and let flannel to recreate kcli ssh $cluster_name-ctlplane-0 << EOF @@ -415,6 +419,21 @@ spec: kind: Issuer name: selfsigned-issuer secretName: operator-webhook-cert +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: metrics-exporter-cert + namespace: ${NAMESPACE} +spec: + commonName: sriov-network-metrics-exporter-service.svc + dnsNames: + - sriov-network-metrics-exporter-service.${NAMESPACE}.svc.cluster.local + - sriov-network-metrics-exporter-service.${NAMESPACE}.svc + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: metrics-exporter-cert EOF @@ -424,6 +443,13 @@ kubectl apply -k $root/config/crd echo "## deploying SRIOV Network Operator" hack/deploy-setup.sh $NAMESPACE +function cluster_info { + if [[ -v TEST_REPORT_PATH ]]; then + kubectl cluster-info dump --namespaces ${NAMESPACE},${MULTUS_NAMESPACE} --output-directory "${root}/${TEST_REPORT_PATH}/cluster-info" + fi +} +trap cluster_info ERR + echo "## wait for sriov operator to be ready" hack/deploy-wait.sh @@ -434,17 +460,5 @@ if [ -z $SKIP_TEST ]; then export 
JUNIT_OUTPUT="${root}/${TEST_REPORT_PATH}/conformance-test-report" fi - # Disable exit on error temporarily to gather cluster information - set +e SUITE=./test/conformance hack/run-e2e-conformance.sh - TEST_EXITE_CODE=$? - set -e - - if [[ -v TEST_REPORT_PATH ]]; then - kubectl cluster-info dump --namespaces ${NAMESPACE},${MULTUS_NAMESPACE} --output-directory "${root}/${TEST_REPORT_PATH}/cluster-info" - fi - - if [[ $TEST_EXITE_CODE -ne 0 ]]; then - exit $TEST_EXITE_CODE - fi fi diff --git a/hack/run-e2e-conformance-virtual-ocp.sh b/hack/run-e2e-conformance-virtual-ocp.sh index 72e295b1e..5e80d3683 100755 --- a/hack/run-e2e-conformance-virtual-ocp.sh +++ b/hack/run-e2e-conformance-virtual-ocp.sh @@ -303,9 +303,21 @@ if [[ -v LOCAL_NETWORK_RESOURCES_INJECTOR_IMAGE ]]; then export NETWORK_RESOURCES_INJECTOR_IMAGE="image-registry.openshift-image-registry.svc:5000/$NAMESPACE/network-resources-injector:latest" fi +if [[ -v LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE ]]; then + podman_tag_and_push ${LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE} "$registry/$NAMESPACE/sriov-network-metrics-exporter:latest" + export METRICS_EXPORTER_IMAGE="image-registry.openshift-image-registry.svc:5000/$NAMESPACE/sriov-network-metrics-exporter:latest" +fi + echo "## deploying SRIOV Network Operator" hack/deploy-setup.sh $NAMESPACE +function cluster_info { + if [[ -v TEST_REPORT_PATH ]]; then + kubectl cluster-info dump --namespaces ${NAMESPACE},${MULTUS_NAMESPACE} --output-directory "${root}/${TEST_REPORT_PATH}/cluster-info" + fi +} +trap cluster_info ERR + echo "## wait for sriov operator to be ready" hack/deploy-wait.sh @@ -316,17 +328,5 @@ if [ -z $SKIP_TEST ]; then export JUNIT_OUTPUT="${root}/${TEST_REPORT_PATH}/conformance-test-report" fi - # Disable exit on error temporarily to gather cluster information - set +e SUITE=./test/conformance hack/run-e2e-conformance.sh - TEST_EXITE_CODE=$? 
-  set -e
-
-  if [[ -v TEST_REPORT_PATH ]]; then
-    kubectl cluster-info dump --namespaces ${NAMESPACE},${MULTUS_NAMESPACE} --output-directory "${root}/${TEST_REPORT_PATH}/cluster-info"
-  fi
-
-  if [[ $TEST_EXITE_CODE -ne 0 ]]; then
-    exit $TEST_EXITE_CODE
-  fi
 fi
diff --git a/hack/run-e2e-test.sh b/hack/run-e2e-test.sh
index df25b87e6..c7d4fea50 100755
--- a/hack/run-e2e-test.sh
+++ b/hack/run-e2e-test.sh
@@ -13,5 +13,6 @@ echo ${NETWORK_RESOURCES_INJECTOR_IMAGE}
 echo ${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE}
 echo ${SRIOV_NETWORK_OPERATOR_IMAGE}
 echo ${SRIOV_NETWORK_WEBHOOK_IMAGE}
+echo ${METRICS_EXPORTER_IMAGE}
 envsubst < deploy/operator.yaml > deploy/operator-init.yaml
 go test ./test/e2e/... -root=$(pwd) -kubeconfig=$KUBECONFIG -globalMan deploy/crds/sriovnetwork.openshift.io_sriovnetworks_crd.yaml -namespacedMan deploy/operator-init.yaml -v -singleNamespace true
diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go
index 7ae5ab41a..282f31add 100644
--- a/pkg/consts/constants.go
+++ b/pkg/consts/constants.go
@@ -19,6 +19,7 @@ const (
 	ConfigDaemonPath                   = "./bindata/manifests/daemon"
 	InjectorWebHookPath                = "./bindata/manifests/webhook"
 	OperatorWebHookPath                = "./bindata/manifests/operator-webhook"
+	MetricsExporterPath                = "./bindata/manifests/metrics-exporter"
 	SystemdServiceOcpPath              = "./bindata/manifests/sriov-config-service/openshift"
 	SystemdServiceOcpMachineConfigName = "sriov-config-service"
 	ServiceCAConfigMapAnnotation       = "service.beta.openshift.io/inject-cabundle"
@@ -130,6 +131,9 @@ const (
 	// ResourceInjectorMatchConditionFeatureGate: switch injector to fail policy and add mactch condition
 	// this will make the mutating webhook to be called only when a pod has 'k8s.v1.cni.cncf.io/networks' annotation
 	ResourceInjectorMatchConditionFeatureGate = "resourceInjectorMatchCondition"
+
+	// MetricsExporterFeatureGate: enable SriovNetworkMetricsExporter on the same nodes where the config-daemon runs
+	MetricsExporterFeatureGate = "metricsExporter"
 )
 
 const (
diff --git 
a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go
index 8bb599fb9..d72f6a57a 100644
--- a/test/conformance/tests/test_sriov_operator.go
+++ b/test/conformance/tests/test_sriov_operator.go
@@ -306,6 +306,29 @@ var _ = Describe("[sriov] operator", func() {
 				g.Expect(newLease.Spec.HolderIdentity).ToNot(Equal(oldLease.Spec.HolderIdentity))
 			}, 30*time.Second, 5*time.Second).Should(Succeed())
 		})
+
+		Context("SriovNetworkMetricsExporter", func() {
+			It("should be deployed if the feature gate is enabled", func() {
+				if discovery.Enabled() {
+					Skip("Test unsuitable to be run in discovery mode")
+				}
+
+				initialValue := isFeatureFlagEnabled("metricsExporter")
+				DeferCleanup(func() {
+					By("Restoring initial feature flag value")
+					setFeatureFlag("metricsExporter", initialValue)
+				})
+
+				By("Enabling `metricsExporter` feature flag")
+				setFeatureFlag("metricsExporter", true)
+
+				By("Checking that a daemon is scheduled on selected node")
+				Eventually(func() bool {
+					return isDaemonsetScheduledOnNodes("node-role.kubernetes.io/worker", "app=sriov-network-metrics-exporter")
+				}, 1*time.Minute, 1*time.Second).Should(BeTrue())
+
+			})
+		})
 	})
 
 	Describe("Generic SriovNetworkNodePolicy", func() {
@@ -2530,13 +2553,17 @@ func podVFIndexInHost(hostNetPod *corev1.Pod, targetPod *corev1.Pod, interfaceNa
 }
 
 func daemonsScheduledOnNodes(selector string) bool {
+	return isDaemonsetScheduledOnNodes(selector, "app=sriov-network-config-daemon")
+}
+
+func isDaemonsetScheduledOnNodes(nodeSelector, daemonsetLabelSelector string) bool {
 	nn, err := clients.CoreV1Interface.Nodes().List(context.Background(), metav1.ListOptions{
-		LabelSelector: selector,
+		LabelSelector: nodeSelector,
 	})
 	Expect(err).ToNot(HaveOccurred())
 	nodes := nn.Items
 
-	daemons, err := clients.Pods(operatorNamespace).List(context.Background(), metav1.ListOptions{LabelSelector: "app=sriov-network-config-daemon"})
+	daemons, err := clients.Pods(operatorNamespace).List(context.Background(), metav1.ListOptions{LabelSelector: daemonsetLabelSelector})
 	Expect(err).ToNot(HaveOccurred())
 	for _, d := range daemons.Items {
 		foundNode := false
@@ -2808,6 +2835,48 @@ func getOperatorConfigLogLevel() int {
 	return cfg.Spec.LogLevel
 }
 
+// isFeatureFlagEnabled reports whether featureFlag is set to true in the
+// "default" SriovOperatorConfig; a missing entry counts as disabled.
+func isFeatureFlagEnabled(featureFlag string) bool {
+	cfg := sriovv1.SriovOperatorConfig{}
+	err := clients.Get(context.TODO(), runtimeclient.ObjectKey{
+		Name:      "default",
+		Namespace: operatorNamespace,
+	}, &cfg)
+	Expect(err).ToNot(HaveOccurred())
+
+	// Indexing a missing key (or a nil map) yields false.
+	return cfg.Spec.FeatureGates[featureFlag]
+}
+
+// setFeatureFlag sets featureFlag to value in the "default"
+// SriovOperatorConfig, retrying the get/update cycle for up to a minute.
+// It does nothing when the flag already has the requested value.
+func setFeatureFlag(featureFlag string, value bool) {
+	Eventually(func(g Gomega) {
+		cfg := sriovv1.SriovOperatorConfig{}
+		err := clients.Get(context.TODO(), runtimeclient.ObjectKey{
+			Name:      "default",
+			Namespace: operatorNamespace,
+		}, &cfg)
+		g.Expect(err).ToNot(HaveOccurred())
+
+		if cfg.Spec.FeatureGates == nil {
+			cfg.Spec.FeatureGates = make(map[string]bool)
+		}
+
+		previousValue, ok := cfg.Spec.FeatureGates[featureFlag]
+		if ok && previousValue == value {
+			return
+		}
+
+		cfg.Spec.FeatureGates[featureFlag] = value
+
+		err = clients.Update(context.TODO(), &cfg)
+		g.Expect(err).ToNot(HaveOccurred())
+	}, 1*time.Minute, 5*time.Second).Should(Succeed())
+}
+
 func getOperatorPod() corev1.Pod {
 	podList, err := clients.Pods(operatorNamespace).List(context.Background(), metav1.ListOptions{
 		LabelSelector: "name=sriov-network-operator",