From 9107af903115cf2b46f7d42c08898ea83a3806e9 Mon Sep 17 00:00:00 2001 From: Yugar-1 Date: Fri, 6 Sep 2024 15:58:48 +0800 Subject: [PATCH] Add memory bandwidth exporter for AI workload. (#379) * Add memory bandwidth exporter for AI workload. Signed-off-by: Yugar-1 --- .github/workflows/pr-go-unittests.yaml | 5 +- .../memory-bandwidth-exporter/.golangci.yml | 62 +++ .../memory-bandwidth-exporter/Dockerfile | 17 + .../memory-bandwidth-exporter/Makefile | 75 +++ .../memory-bandwidth-exporter/README.md | 151 ++++++ .../memory-bandwidth-exporter/cmd/main.go | 242 ++++++++++ .../collector/class_collector.go | 191 ++++++++ .../collector/collector.go | 177 ++++++++ .../collector/common.go | 428 ++++++++++++++++++ .../collector/container_collector.go | 290 ++++++++++++ .../collector/info.go | 166 +++++++ .../collector/node_collector.go | 145 ++++++ .../config/config.sh | 9 + .../manifests/memory-bandwidth-exporter.yaml | 59 +++ .../memory-bandwidth-exporter/go.mod | 49 ++ .../memory-bandwidth-exporter/go.sum | 164 +++++++ .../info/container.go | 17 + .../oci/hooks.d/create-runtime.json | 10 + .../etc/containers/oci/hooks.d/post-stop.json | 10 + .../plugin/hook-injector.go | 225 +++++++++ .../plugin/usr/local/sbin/create-runtime.sh | 17 + .../plugin/usr/local/sbin/post-stop.sh | 10 + 22 files changed, 2518 insertions(+), 1 deletion(-) create mode 100644 kubernetes-addons/memory-bandwidth-exporter/.golangci.yml create mode 100644 kubernetes-addons/memory-bandwidth-exporter/Dockerfile create mode 100644 kubernetes-addons/memory-bandwidth-exporter/Makefile create mode 100644 kubernetes-addons/memory-bandwidth-exporter/README.md create mode 100644 kubernetes-addons/memory-bandwidth-exporter/cmd/main.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/collector/class_collector.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/collector/collector.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/collector/common.go create mode 100644 
kubernetes-addons/memory-bandwidth-exporter/collector/container_collector.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/collector/info.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/collector/node_collector.go create mode 100755 kubernetes-addons/memory-bandwidth-exporter/config/config.sh create mode 100644 kubernetes-addons/memory-bandwidth-exporter/config/manifests/memory-bandwidth-exporter.yaml create mode 100644 kubernetes-addons/memory-bandwidth-exporter/go.mod create mode 100644 kubernetes-addons/memory-bandwidth-exporter/go.sum create mode 100644 kubernetes-addons/memory-bandwidth-exporter/info/container.go create mode 100644 kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/create-runtime.json create mode 100644 kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/post-stop.json create mode 100644 kubernetes-addons/memory-bandwidth-exporter/plugin/hook-injector.go create mode 100755 kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/create-runtime.sh create mode 100755 kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/post-stop.sh diff --git a/.github/workflows/pr-go-unittests.yaml b/.github/workflows/pr-go-unittests.yaml index a22b6351..c669d4a7 100644 --- a/.github/workflows/pr-go-unittests.yaml +++ b/.github/workflows/pr-go-unittests.yaml @@ -91,6 +91,9 @@ jobs: - name: Run tests and generate coverage run: | + if [ "${{ matrix.gopath }}" == "${MBE_DIR}" ]; then + exit 0 + fi cd ${{ matrix.gopath }} go test -coverprofile=coverage.out $(go list ./... 
| grep -v /e2e) - ../.github/workflows/scripts/go-coverage.sh + ${{ github.workspace }}/.github/workflows/scripts/go-coverage.sh diff --git a/kubernetes-addons/memory-bandwidth-exporter/.golangci.yml b/kubernetes-addons/memory-bandwidth-exporter/.golangci.yml new file mode 100644 index 00000000..11cbd686 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/.golangci.yml @@ -0,0 +1,62 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +run: + timeout: 5m + allow-parallel-runners: true + +issues: + exclude-use-default: false + exclude-rules: + - path: _test.go + linters: + - errcheck + +linters: + disable-all: true + enable: + - depguard + - misspell + - revive + - dupl + - errcheck + - exportloopref + - goconst + - gocyclo + - gofmt + - goimports + - gosimple + - govet + - ineffassign + - lll + - misspell + - nakedret + - prealloc + - staticcheck + - typecheck + - unconvert + - unparam + - unused + + +linters-settings: + depguard: + rules: + no_exec_policy: + files: + - "!$test" + deny: + - pkg: "os/exec" + desc: "Using os/exec to run sub processes is not allowed by policy" + errcheck: + exclude-functions: + # Used in HTTP handlers, any error is handled by the server itself. + - (net/http.ResponseWriter).Write + # Never check for logger errors. + - (github.com/go-kit/log.Logger).Log + revive: + rules: + # https://github.com/mgechev/revive/blob/master/RULES_DESCRIPTIONS.md#unused-parameter + - name: unused-parameter + severity: warning + disabled: true diff --git a/kubernetes-addons/memory-bandwidth-exporter/Dockerfile b/kubernetes-addons/memory-bandwidth-exporter/Dockerfile new file mode 100644 index 00000000..e69e26c1 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/Dockerfile @@ -0,0 +1,17 @@ +FROM golang:1.22 AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY . 
/workspace/ +RUN go mod download + +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o memory-bandwidth-exporter cmd/main.go + +FROM ubuntu:22.04 +USER root +WORKDIR / +COPY --from=builder /workspace/memory-bandwidth-exporter . + +ENTRYPOINT ["bash", "-c"] +CMD ["/memory-bandwidth-exporter --collector.node.name=${NODE_NAME} --collector.container.namespaceWhiteList=${NAMESPACE_WHITELIST}"] \ No newline at end of file diff --git a/kubernetes-addons/memory-bandwidth-exporter/Makefile b/kubernetes-addons/memory-bandwidth-exporter/Makefile new file mode 100644 index 00000000..ed66005d --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/Makefile @@ -0,0 +1,75 @@ +GO_CMD := go +DEBUG ?= 0 +DOCKER_REGISTRY ?= docker.io/opea +CONTAINER_TOOL ?= docker +VERSION ?= latest + +ifeq ($(DEBUG),0) + GOFLAGS=-ldflags="all=-s -w" +endif + +MBE_IMG_NAME = memory-bandwidth-exporter:$(VERSION) +MBE_IMG = ${DOCKER_REGISTRY}/${MBE_IMG_NAME} + +build: + @mkdir -p bin + @echo "Building memory-bandwidth-exporter binary..." + $(GO_CMD) build -o bin/memory-bandwidth-exporter $(GOFLAGS) cmd/main.go + +docker.build: + @echo "Building memory-bandwidth-exporter Docker image..." + $(CONTAINER_TOOL) build -t ${MBE_IMG} -f Dockerfile . + +docker.push: + @echo "Push memory-bandwidth-exporter Docker image..." + $(CONTAINER_TOOL) push ${MBE_IMG} + +clean: + @echo "Cleaning up..." + rm -rf bin + +change_img: + sed -i "s\MBE_IMG\${MBE_IMG}\g" config/manifests/memory-bandwidth-exporter.yaml + +test: + @echo "Running tests..." + $(GO_CMD) test ./... + +lint: golangci-lint ## Run golangci-lint linter & yamllint + @echo "Running linters...${GOLANGCI_LINT}" + $(GOLANGCI_LINT) run ./... + +lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes + $(GOLANGCI_LINT) run --fix ./... 
+ +##@ Dependencies + +## Location to install dependencies to +LOCALBIN ?= $(shell pwd)/bin +$(LOCALBIN): + mkdir -p $(LOCALBIN) + +## Tool Binaries +GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION) + +## Tool Versions +GOLANGCI_LINT_VERSION ?= v1.59.1 + +.PHONY: golangci-lint +golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. +$(GOLANGCI_LINT): $(LOCALBIN) + $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,${GOLANGCI_LINT_VERSION}) + +# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist +# $1 - target path with name of binary (ideally with version) +# $2 - package url which can be installed +# $3 - specific version of package +define go-install-tool +@[ -f $(1) ] || { \ +set -e; \ +package=$(2)@$(3) ;\ +echo "Downloading $${package}" ;\ +GOBIN=$(LOCALBIN) go install $${package} ;\ +mv "$$(echo "$(1)" | sed "s/-$(3)$$//")" $(1) ;\ +} +endef \ No newline at end of file diff --git a/kubernetes-addons/memory-bandwidth-exporter/README.md b/kubernetes-addons/memory-bandwidth-exporter/README.md new file mode 100644 index 00000000..d2cc1fe0 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/README.md @@ -0,0 +1,151 @@ +# memory bandwidth exporter + +Pod/container grained memory bandwidth exporter provides users with memory bandwidth metrics of their running containers. The metrics include llc_occupancy, mbm_local_bytes, mbm_total_bytes, cpu utilization and memory usage, and the metrics have been processed. In addition to container-level metrics, it also provides class-level and socket-level metrics. Users can configure the list of metrics to be collected. It serves as an exporter which can be connected to Prometheus-like observability tools. And it also can be used as a telemetry provider. 
+ +Memory bandwidth exporter makes use of state-of-the-art technologies like NRI to build a resource-efficient and well-maintained solution. This solution provides observability to memory bandwidth to OPEA micro-services. It lays the groundwork of better scaling and auto scaling of OPEA. It can also be deployed separately on end user environments, supporting any cases that memory bandwidth metrics are required. + +The memory bandwidth exporter currently only supports Intel platforms with RDT, and will fail on other platforms. We will add node feature discovery in the future. + +## Setup + +### Enable NRI in Containerd + +```sh +# download containerd binary, containerd version v1.7.0 or higher is required +$ wget https://github.com/containerd/containerd/releases/download/v1.7.0/containerd-1.7.0-linux-amd64.tar.gz + +# stop running containerd +$ sudo systemctl stop containerd + +# replace old containerd +$ sudo tar Cxzvf /usr/local containerd-1.7.0-linux-amd64.tar.gz + +# enable NRI in containerd +# add an item in /etc/containerd/config.toml +[plugins."io.containerd.nri.v1.nri"] + disable = false + disable_connections = false + plugin_config_path = "/etc/containerd/certs.d" + plugin_path = "/opt/nri/plugins" + socket_path = "/var/run/nri/nri.sock" + config_file = "/etc/nri/nri.conf" + +# restart containerd +$ sudo systemctl start containerd +$ sudo systemctl status containerd + +# test nri +$ git clone https://github.com/containerd/nri +$ cd nri +$ make +$ ./build/bin/logger -idx 00 +``` + +### Enable RDT + +Mount resctrl to the directory `/sys/fs/resctrl`: + +```sh +$ sudo mount -t resctrl resctrl /sys/fs/resctrl +``` + +### Setup memory bandwidth exporter + +Before setup, you need to configure the runc hook: + +```sh +$ ./config/config.sh +``` + +#### How to build the binary and setup? 
+ +```sh +$ make build +$ sudo ./bin/memory-bandwidth-exporter +# e.g., sudo ./bin/memory-bandwidth-exporter --collector.node.name= --collector.container.namespaceWhiteList="calico-apiserver,calico-system,kube-system,tigera-operator" + +# get memory bandwidth metrics +$ curl http://localhost:9100/metrics +``` + +#### How to build the docker image and setup? + +```sh +$ make docker.build +$ sudo docker run \ + -e NODE_NAME= \ + -e NAMESPACE_WHITELIST="calico-apiserver,calico-system,kube-system,tigera-operator" \ + --mount type=bind,source=/etc/containers/oci/hooks.d/,target=/etc/containers/oci/hooks.d/ \ + --privileged \ + --cgroupns=host \ + --pid=host \ + --mount type=bind,source=/usr/,target=/usr/ \ + --mount type=bind,source=/sys/fs/resctrl/,target=/sys/fs/resctrl/ \ + --mount type=bind,source=/var/run/nri/,target=/var/run/nri/ \ + -d -p 9100:9100 \ + --name=memory-bandwidth-exporter \ + opea/memory-bandwidth-exporter:latest + +# get memory bandwidth metrics +$ curl http://localhost:9100/metrics +``` + +#### How to deploy on the K8s cluster? + +Build and push your image to the location specified by `MBE_IMG`, and apply manifest: + +```sh +$ make docker.build docker.push MBE_IMG=/opea/memory-bandwidth-exporter: +$ make change_img MBE_IMG=/opea/memory-bandwidth-exporter: +# If namespace system does not exist, create it. +$ kubectl create ns system +$ kubectl apply -f config/manifests/memory-bandwidth-exporter.yaml +``` + +Check the installation result: + +```sh +kubectl get pods -n system +NAME READY STATUS RESTARTS AGE +memory-bandwidth-exporter-zxhdl 1/1 Running 0 3m +``` + +get memory bandwidth metrics + +```sh +$ curl http://:9100/metrics +``` + +#### How to delete binary? + +```sh +$ make clean +``` + +## More flags about memory bandwidth exporter + +There are some flags to help users better use memory bandwidth exporter: + +```sh +-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man). +--collector.node.name="" Give node name. 
+--collector.container.namespaceWhiteList="" Filter out containers whose namespaces belong to the namespace whitelist, namespaces separated by commas, like "xx,xx,xx". +--collector.container.monTimes=10 Scan the pids of containers created before the exporter starts to prevent the loss of pids. +--collector.container.metrics="all" Enable container collector metrics. +--collector.class.metrics="none" Enable class collector metrics. +--collector.node.metrics="none" Enable node collector metrics. +--web.telemetry-path="/metrics" Path under which to expose metrics. +--[no-]web.disable-exporter-metrics Exclude metrics about the exporter itself (promhttp_*, process_*, go_*). +--web.max-requests=40 Maximum number of parallel scrape requests. Use 0 to disable. +--runtime.gomaxprocs=1 The target number of CPUs Go will run on (GOMAXPROCS) ($GOMAXPROCS) +--[no-]web.systemd-socket Use systemd socket activation listeners instead of port listeners (Linux only). +--web.listen-address=:9100 ... Addresses on which to expose metrics and web interface. Repeatable for multiple addresses. +--web.config.file="" Path to configuration file that can enable TLS or authentication. See: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md +--collector.interval=3s memory bandwidth exporter collect metrics interval +--NRIplugin.name="mb-nri-plugin" Plugin name to register to NRI +--NRIplugin.idx="11" Plugin index to register to NRI +--[no-]disableWatch Disable watching hook directories for new hooks +--log.level=info Only log messages with the given severity or above. One of: [debug, info, warn, error] +--log.format=logfmt Output format of log messages. One of: [logfmt, json] +--[no-]version Show application version. 
+``` diff --git a/kubernetes-addons/memory-bandwidth-exporter/cmd/main.go b/kubernetes-addons/memory-bandwidth-exporter/cmd/main.go new file mode 100644 index 00000000..9334d19d --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/cmd/main.go @@ -0,0 +1,242 @@ +package main + +import ( + "fmt" + stdlog "log" + "net/http" + "os" + "os/user" + "runtime" + "sort" + + "github.com/alecthomas/kingpin/v2" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/opea-project/GenAIInfra/kubernetes-addons/memory-bandwidth-exporter/collector" + "github.com/opea-project/GenAIInfra/kubernetes-addons/memory-bandwidth-exporter/plugin" + "github.com/prometheus/client_golang/prometheus" + promcollectors "github.com/prometheus/client_golang/prometheus/collectors" + versioncollector "github.com/prometheus/client_golang/prometheus/collectors/version" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/common/promlog" + "github.com/prometheus/common/promlog/flag" + "github.com/prometheus/common/version" + "github.com/prometheus/exporter-toolkit/web" + "github.com/prometheus/exporter-toolkit/web/kingpinflag" +) + +var ( + metricsPath = kingpin.Flag( + "web.telemetry-path", + "Path under which to expose metrics.", + ).Default("/metrics").String() + disableExporterMetrics = kingpin.Flag( + "web.disable-exporter-metrics", + "Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).", + ).Default("true").Bool() + maxRequests = kingpin.Flag( + "web.max-requests", + "Maximum number of parallel scrape requests. 
Use 0 to disable.", + ).Default("40").Int() + maxProcs = kingpin.Flag( + "runtime.gomaxprocs", + "The target number of CPUs Go will run on (GOMAXPROCS)", + ).Envar("GOMAXPROCS").Default("1").Int() + toolkitFlags = kingpinflag.AddFlags(kingpin.CommandLine, ":9100") + interval = kingpin.Flag( + "collector.interval", + "memory bandwidth exporter collect metrics interval", + ).Default("3s").Duration() + pluginName = kingpin.Flag( + "NRIplugin.name", + "Plugin name to register to NRI", + ).Default("mb-nri-plugin").String() + pluginIdx = kingpin.Flag( + "NRIplugin.idx", + "Plugin index to register to NRI", + ).Default("11").String() + disableWatch = kingpin.Flag( + "disableWatch", + "Disable watching hook directories for new hooks", + ).Default("false").Bool() +) + +type handler struct { + unfilteredHandler http.Handler + // exporterMetricsRegistry is a separate registry for the metrics about + // the exporter itself. + exporterMetricsRegistry *prometheus.Registry + includeExporterMetrics bool + maxRequests int + logger log.Logger +} + +func newHandler(includeExporterMetrics bool, maxRequests int, logger log.Logger) *handler { + h := &handler{ + exporterMetricsRegistry: prometheus.NewRegistry(), + includeExporterMetrics: includeExporterMetrics, + maxRequests: maxRequests, + logger: logger, + } + if h.includeExporterMetrics { + h.exporterMetricsRegistry.MustRegister( + promcollectors.NewProcessCollector(promcollectors.ProcessCollectorOpts{}), + promcollectors.NewGoCollector(), + ) + } + if innerHandler, err := h.innerHandler(); err != nil { + panic(fmt.Sprintf("Couldn't create metrics handler: %s", err)) + } else { + h.unfilteredHandler = innerHandler + } + return h +} + +// ServeHTTP implements http.Handler. +func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + filters := r.URL.Query()["collect[]"] + level.Debug(h.logger).Log("msg", "collect query:", "filters", filters) + + if len(filters) == 0 { + // No filters, use the prepared unfiltered handler. 
+ h.unfilteredHandler.ServeHTTP(w, r) + return + } + // To serve filtered metrics, we create a filtering handler on the fly. + filteredHandler, err := h.innerHandler(filters...) + if err != nil { + level.Warn(h.logger).Log("msg", "Couldn't create filtered metrics handler:", "err", err) + w.WriteHeader(http.StatusBadRequest) + w.Write([]byte(fmt.Sprintf("Couldn't create filtered metrics handler: %s", err))) + return + } + filteredHandler.ServeHTTP(w, r) +} + +// innerHandler is used to create both the one unfiltered http.Handler to be +// wrapped by the outer handler and also the filtered handlers created on the +// fly. The former is accomplished by calling innerHandler without any arguments +// (in which case it will log all the collectors enabled via command-line +// flags). +func (h *handler) innerHandler(filters ...string) (http.Handler, error) { + level.Info(h.logger).Log("msg", "filters", filters) + nc, err := collector.NewCollector(h.logger, *interval, filters...) + if err != nil { + return nil, fmt.Errorf("couldn't create collector: %s", err) + } + + // Only log the creation of an unfiltered handler, which should happen + // only once upon startup. 
+ if len(filters) == 0 { + level.Info(h.logger).Log("msg", "Enabled collectors") + collectors := []string{} + for n := range nc.Collectors { + collectors = append(collectors, n) + } + sort.Strings(collectors) + for _, c := range collectors { + level.Info(h.logger).Log("collector", c) + } + } + + r := prometheus.NewRegistry() + r.MustRegister(versioncollector.NewCollector("memory_bandwidth_exporter")) + if err := r.Register(nc); err != nil { + return nil, fmt.Errorf("couldn't register collector: %s", err) + } + + var handler http.Handler + if h.includeExporterMetrics { + handler = promhttp.HandlerFor( + prometheus.Gatherers{h.exporterMetricsRegistry, r}, + promhttp.HandlerOpts{ + ErrorLog: stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0), + ErrorHandling: promhttp.ContinueOnError, + MaxRequestsInFlight: h.maxRequests, + Registry: h.exporterMetricsRegistry, + }, + ) + // Note that we have to use h.exporterMetricsRegistry here to + // use the same promhttp metrics for all expositions. + handler = promhttp.InstrumentMetricHandler( + h.exporterMetricsRegistry, handler, + ) + } else { + handler = promhttp.HandlerFor( + r, + promhttp.HandlerOpts{ + ErrorLog: stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0), + ErrorHandling: promhttp.ContinueOnError, + MaxRequestsInFlight: h.maxRequests, + }, + ) + } + + return handler, nil +} + +// The memory bandwidth exporter currently only supports Intel platforms with RDT, and will fail on other platforms. +// We will add node feature discovery in the future. 
+func main() { + promlogConfig := &promlog.Config{} + flag.AddFlags(kingpin.CommandLine, promlogConfig) + version.Version = "v0.1.0" + kingpin.Version(version.Print("memory bandwidth exporter")) + kingpin.CommandLine.UsageWriter(os.Stdout) + kingpin.HelpFlag.Short('h') + kingpin.Parse() + logger := promlog.New(promlogConfig) + + level.Info(logger).Log("msg", "Starting memory bandwidth exporter", "version", version.Info()) + level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext()) + if user, err := user.Current(); err == nil && user.Uid == "0" { + level.Warn(logger).Log("msg", `Memory bandwidth exporter is running as root user. This + exporter is designed to run as unprivileged user, root is not required.`) + } + runtime.GOMAXPROCS(*maxProcs) + level.Info(logger).Log("msg", "Go MAXPROCS", "procs", runtime.GOMAXPROCS(0)) + isNeedMakeMonitorGroup := collector.ParseCollectorMetrics() + nriPlugin := &plugin.Plugin{ + PluginName: *pluginName, + PluginIdx: *pluginIdx, + DisableWatch: *disableWatch, + Logger: logger, + } + level.Info(logger).Log("msg", "Starting NRI plugin") + errChan := make(chan error) + go func() { + errChan <- nriPlugin.Run(isNeedMakeMonitorGroup) + }() + go func() { + if err := <-errChan; err != nil { + level.Error(logger).Log("Failed to run nriPlugin", "error", err) + } + }() + + http.Handle(*metricsPath, newHandler(!*disableExporterMetrics, *maxRequests, logger)) + if *metricsPath != "/" { + landingConfig := web.LandingConfig{ + Name: "Memory Bandwidth Exporter", + Description: "Prometheus Memory Bandwidth Exporter", + Version: version.Info(), + Links: []web.LandingLinks{ + { + Address: *metricsPath, + Text: "Metrics", + }, + }, + } + landingPage, err := web.NewLandingPage(landingConfig) + if err != nil { + level.Error(logger).Log("err", err) + os.Exit(1) + } + http.Handle("/", landingPage) + } + + server := &http.Server{} + if err := web.ListenAndServe(server, toolkitFlags, logger); err != nil { + 
level.Error(logger).Log("err", err) + os.Exit(1) + } +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/class_collector.go b/kubernetes-addons/memory-bandwidth-exporter/collector/class_collector.go new file mode 100644 index 00000000..c3b14746 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/class_collector.go @@ -0,0 +1,191 @@ +package collector + +import ( + "os" + "path/filepath" + "strings" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + classCollectorSubsystem = "rdtClass" +) + +type classCollector struct { + statsCache map[string]*stats + interval time.Duration + logger log.Logger + nodeName string + metrics map[string]struct{} +} + +func init() { + registerCollector(classCollectorSubsystem, defaultDisabled, NewClassCollector) +} + +// NewClassCollector returns a new Collector exposing class level memory bandwidth metrics. +func NewClassCollector(logger log.Logger, interval time.Duration) (Collector, error) { + c := &classCollector{ + statsCache: make(map[string]*stats), + interval: interval, + logger: logger, + nodeName: *nodeName, + metrics: make(map[string]struct{}), + } + logger.Log("info", "new class collector", "metrics", *classCollectorMetrics) + if *classCollectorMetrics == allMetrics { + for _, m := range allClassMetrics { + c.metrics[m] = struct{}{} + } + } else if *classCollectorMetrics != noMetrics { + for _, m := range strings.Split(*classCollectorMetrics, ",") { + c.metrics[m] = struct{}{} + } + } + c.Start() + return c, nil +} + +func (c *classCollector) Start() { + c.logger.Log("info", "start class collector", "metrics", getMetricsKeys(c.metrics)) + if isNeedCollectMbLLc(c.metrics) { + go func() { + for { + err := c.updateClasses() + if err != nil { + c.logger.Log("error", "class collector update classes failed", "err", err) + } + time.Sleep(jitter(c.interval)) + } + }() + } + go func() { + for { + err := c.updateStats() + if err 
!= nil { + c.logger.Log("error", "class collector update stats failed", "err", err) + } + time.Sleep(jitter(c.interval)) + } + }() +} + +func (c *classCollector) updateClasses() error { + excludeDirs := map[string]bool{ + "info": true, + "mon_data": true, + "mon_groups": true, + } + files, err := os.ReadDir(rootResctrlPath) + if err != nil { + return err + } + for _, file := range files { + if file.IsDir() { + dirName := file.Name() + _, ok := c.statsCache[dirName] + if !excludeDirs[dirName] && !ok { + c.statsCache[dirName] = nil + } + } + } + return nil +} + +func (c *classCollector) updateStats() error { + for class := range c.statsCache { + newStats := RawStats{} + var err error + if isNeedCollectMbLLc(c.metrics) { + newStats.SocketNum, newStats.MemoryBandwidth, newStats.Cache, err = + getIntelRDTStatsFrom(filepath.Join(rootResctrlPath, class)) + if err != nil { + return err + } + } + if c.statsCache[class] != nil { + pStats, err := processStats(c.statsCache[class].oldStats, newStats) + if err != nil { + return err + } + c.statsCache[class] = &stats{ + oldStats: newStats, + processedStats: pStats, + } + } else { + c.statsCache[class] = &stats{ + oldStats: newStats, + processedStats: ProcessedStats{}, + } + } + } + return nil +} + +func (c *classCollector) Update(ch chan<- prometheus.Metric) error { + if len(c.statsCache) == 0 { + c.logger.Log("info", "class collector stats have no cache") + return nil + } + if !isNeedCollectMbLLc(c.metrics) { + return nil + } + // cid is the container id + for cid, stats := range c.statsCache { + ch <- prometheus.MustNewConstMetric( + classTotalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.processedStats.SumMemoryBandwidth.TotalMBps, + cid, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + classLocalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.processedStats.SumMemoryBandwidth.LocalMBps, + cid, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + classLLCacheDesc, + prometheus.GaugeValue, + 
stats.processedStats.SumCache.LLCOccupancy, + cid, + c.nodeName, + ) + // sid is the socket id + for sid, s := range stats.processedStats.MemoryBandwidth { + ch <- prometheus.MustNewConstMetric( + socketClassTotalMemoryBandwidthDesc, + prometheus.GaugeValue, + s.TotalMBps, + sid, + cid, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + socketClassLocalMemoryBandwidthDesc, + prometheus.GaugeValue, + s.LocalMBps, + sid, + cid, + c.nodeName, + ) + } + // sid is the socket id + for sid, s := range stats.processedStats.Cache { + ch <- prometheus.MustNewConstMetric( + socketClassLLCacheDesc, + prometheus.GaugeValue, + s.LLCOccupancy, + sid, + cid, + c.nodeName, + ) + } + } + return nil +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/collector.go b/kubernetes-addons/memory-bandwidth-exporter/collector/collector.go new file mode 100644 index 00000000..83b7d7f3 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/collector.go @@ -0,0 +1,177 @@ +package collector + +import ( + "errors" + "fmt" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" +) + +// Namespace defines the common namespace to be used by all metrics. 
+const namespace = "rdt" + +var ( + scrapeSuccessDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "scrape", "collector_success"), + "memory_bandwidth_exporter: Whether a collector succeeded.", + []string{"collector"}, + nil, + ) +) + +const ( + defaultEnabled = true + defaultDisabled = false +) + +var ( + factories = make(map[string]func(logger log.Logger, interval time.Duration) (Collector, error)) + initiatedCollectorsMtx = sync.Mutex{} + initiatedCollectors = make(map[string]Collector) + collectorState = make(map[string]*bool) + collectorMetrics = make(map[string]string) +) + +func registerCollector(collector string, isDefaultEnabled bool, factory func( + logger log.Logger, interval time.Duration) (Collector, error)) { + collectorState[collector] = &isDefaultEnabled + factories[collector] = factory +} + +func ParseCollectorMetrics() bool { + isNeedNRIPlugin := false + for collector := range collectorState { + if collector == containerCollectorSubsystem { + var isDefaultEnabled bool + if *containerCollectorMetrics == noMetrics { + isDefaultEnabled = false + } else { + isDefaultEnabled = true + } + collectorState[collector] = &isDefaultEnabled + collectorMetrics[collector] = *containerCollectorMetrics + if *containerCollectorMetrics == allMetrics || strings.Contains(*containerCollectorMetrics, "mb") || + strings.Contains(*containerCollectorMetrics, "llc") { + isNeedNRIPlugin = true + } + } + if collector == classCollectorSubsystem { + var isDefaultEnabled bool + if *classCollectorMetrics == "none" { + isDefaultEnabled = false + } else { + isDefaultEnabled = true + } + collectorState[collector] = &isDefaultEnabled + collectorMetrics[collector] = *classCollectorMetrics + } + if collector == nodeCollectorSubsystem { + var isDefaultEnabled bool + if *nodeCollectorMetrics == "none" { + isDefaultEnabled = false + } else { + isDefaultEnabled = true + } + collectorState[collector] = &isDefaultEnabled + collectorMetrics[collector] = *nodeCollectorMetrics + } 
+ } + return isNeedNRIPlugin +} + +// Collector implements the prometheus.Collector interface. +type MBCollector struct { + Collectors map[string]Collector + logger log.Logger +} + +// NewCollector creates a new Collector. +func NewCollector(logger log.Logger, interval time.Duration, filters ...string) (*MBCollector, error) { + f := make(map[string]bool) + for _, filter := range filters { + enabled, exist := collectorState[filter] + if !exist { + return nil, fmt.Errorf("missing collector: %s", filter) + } + if !*enabled { + return nil, fmt.Errorf("disabled collector: %s", filter) + } + f[filter] = true + } + collectors := make(map[string]Collector) + initiatedCollectorsMtx.Lock() + defer initiatedCollectorsMtx.Unlock() + for key, enabled := range collectorState { + if !*enabled || (len(f) > 0 && !f[key]) { + continue + } + if collector, ok := initiatedCollectors[key]; ok { + collectors[key] = collector + } else { + collector, err := factories[key](log.With(logger, "collector", key), interval) + if err != nil { + return nil, err + } + collectors[key] = collector + initiatedCollectors[key] = collector + } + } + return &MBCollector{Collectors: collectors, logger: logger}, nil +} + +// Describe implements the prometheus.Collector interface. +func (n MBCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- scrapeSuccessDesc +} + +// Collect implements the prometheus.Collector interface. 
+func (n MBCollector) Collect(ch chan<- prometheus.Metric) { + wg := sync.WaitGroup{} + wg.Add(len(n.Collectors)) + for name, c := range n.Collectors { + go func(name string, c Collector) { + execute(name, c, ch, n.logger) + wg.Done() + }(name, c) + } + wg.Wait() +} + +func execute(name string, c Collector, ch chan<- prometheus.Metric, logger log.Logger) { + begin := time.Now() + err := c.Update(ch) + duration := time.Since(begin) + var success float64 + + if err != nil { + if IsNoDataError(err) { + level.Debug(logger).Log("msg", "collector returned no data", "name", name, "duration_seconds", duration.Seconds(), + "err", err) + } else { + level.Error(logger).Log("msg", "collector failed", "name", name, "duration_seconds", duration.Seconds(), "err", err) + } + success = 0 + } else { + level.Debug(logger).Log("msg", "collector succeeded", "name", name, "duration_seconds", duration.Seconds()) + success = 1 + } + ch <- prometheus.MustNewConstMetric(scrapeSuccessDesc, prometheus.GaugeValue, success, name) +} + +// Collector is the interface a collector has to implement. +type Collector interface { + // Get new metrics and expose them via prometheus registry. + Update(ch chan<- prometheus.Metric) error +} + +// ErrNoData indicates the collector found no data to collect, but had no other error. 
+var ErrNoData = errors.New("collector returned no data") + +func IsNoDataError(err error) bool { + return err == ErrNoData +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/common.go b/kubernetes-addons/memory-bandwidth-exporter/collector/common.go new file mode 100644 index 00000000..37f92509 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/common.go @@ -0,0 +1,428 @@ +package collector + +import ( + "bufio" + "bytes" + "fmt" + "math/rand" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/alecthomas/kingpin/v2" +) + +const ( + monDataDirName = "mon_data" + llcOccupancyFileName = "llc_occupancy" + mbmLocalBytesFileName = "mbm_local_bytes" + mbmTotalBytesFileName = "mbm_total_bytes" + unavailable = "Unavailable" + rootResctrlPath = "/sys/fs/resctrl" + cgroupControllerPath = "/sys/fs/cgroup/cgroup.controllers" + fmtTime = "2006-01-02 15:04:05" + allMetrics = "all" + noMetrics = "none" +) + +var ( + nodeName = kingpin.Flag( + "collector.node.name", + "Give node name.", + ).Default("").String() + namespaceWhiteList = kingpin.Flag( + "collector.container.namespaceWhiteList", + `Filter out containers whose namespaces belong to the namespace whitelist, + namespaces separated by commas, like \"xx,yy,zz\".`, + ).Default("").String() + monTimes = kingpin.Flag( + "collector.container.monTimes", + "Scan the pids of containers created before the exporter starts to prevent the loss of pids.", + ).Default("10").Int() + containerCollectorMetrics = kingpin.Flag( + "collector.container.metrics", + "Enable container collector metrics", + ).Default("all").String() + classCollectorMetrics = kingpin.Flag( + "collector.class.metrics", + "Enable class collector metrics", + ).Default("none").String() + nodeCollectorMetrics = kingpin.Flag( + "collector.node.metrics", + "Enable node collector metrics", + ).Default("none").String() + allClassMetrics = []string{"mb", "llc"} + allNodeMetrics = []string{"mb", "llc"} + 
allContainerMetrics = []string{"mb", "llc", "cpu", "memory"} +) + +func isNeedCollectMbLLc(metrics map[string]struct{}) bool { + _, ok1 := metrics["mb"] + _, ok2 := metrics["llc"] + return ok1 || ok2 +} + +func isNeedCollectCpu(metrics map[string]struct{}) bool { + _, ok := metrics["cpu"] + return ok +} + +func isNeedCollectMemory(metrics map[string]struct{}) bool { + _, ok := metrics["memory"] + return ok +} + +func getMetricsKeys(m map[string]struct{}) string { + keys := make([]string, 0, len(m)) + for key := range m { + keys = append(keys, key) + } + return strings.Join(keys, ",") +} + +func jitter(duration time.Duration) time.Duration { + const maxFactor = 0.1 + wait := duration + time.Duration(rand.Float64()*maxFactor*float64(duration)) + return wait +} + +// path: mon_groups path +func getIntelRDTStatsFrom(path string) (int, map[string]RawMemoryBandwidthStats, map[string]RawCacheStats, error) { + _, err := os.Stat(path) + if os.IsNotExist(err) { + return 0, nil, nil, fmt.Errorf("mon_groups path %q does not exist", path) + } + statsDirectories, err := filepath.Glob(filepath.Join(path, monDataDirName, "*")) + if err != nil { + return 0, nil, nil, err + } + + if len(statsDirectories) == 0 { + return 0, nil, nil, fmt.Errorf("there is no mon_data stats directories: %q", path) + } + + cmtStats := make(map[string]RawCacheStats, 0) + mbmStats := make(map[string]RawMemoryBandwidthStats, 0) + + socketNum := len(statsDirectories) + for _, dir := range statsDirectories { + dirParts := strings.Split(dir, "_") + nid := dirParts[len(dirParts)-1] + + llcOccupancy, _, err := readStatFrom(filepath.Join(dir, llcOccupancyFileName)) + if err != nil { + return socketNum, nil, nil, err + } + cmtStats[nid] = RawCacheStats{ + LLCOccupancy: llcOccupancy, + } + + totalBytes, tBtime, err := readStatFrom(filepath.Join(dir, mbmTotalBytesFileName)) + if err != nil { + return socketNum, nil, nil, err + } + localBytes, lBtime, err := readStatFrom(filepath.Join(dir, mbmLocalBytesFileName)) + 
if err != nil { + return socketNum, nil, nil, err + } + mbmStats[nid] = RawMemoryBandwidthStats{ + TotalBytes: totalBytes, + TotalBytesTimeStamp: tBtime, + LocalBytes: localBytes, + LocalBytesTimeStamp: lBtime, + } + } + return socketNum, mbmStats, cmtStats, nil +} + +// path: cgroupPath +func getCPUUtilizationFrom(path string) (RawCPUStats, error) { + cgroupVersion := getCgroupVersion() + var err error + stat := RawCPUStats{ + TimeStamp: time.Now().Format(fmtTime), + } + if cgroupVersion == "v1" { + stat.CPU, err = getCgroupV1CpuTime(path) + if err != nil { + return stat, err + } + } else { + stat.CPU, err = getCgroupV2CpuTime(path) + if err != nil { + return stat, err + } + } + + return stat, nil +} + +// path: cgroupPath +func getMemorySizeFrom(path string) (int64, error) { + cgroupVersion := getCgroupVersion() + var err error + var filePath string + if cgroupVersion == "v1" { + filePath = filepath.Join(path, "memory.usage_in_bytes") + } + if cgroupVersion == "v2" { + filePath = filepath.Join(path, "memory.current") + } + _, err = os.Stat(filePath) + if err != nil { + return 0, err + } + content, err := os.ReadFile(filePath) + + if err != nil { + return 0, err + } + memory, err := strconv.ParseInt(strings.TrimSpace(string(content)), 10, 64) + return memory, err +} + +func getCgroupVersion() string { + _, err := os.Stat(cgroupControllerPath) + if err == nil { + return "v2" + } else { + return "v1" + } +} + +func getCgroupV1CpuTime(cgroupPath string) (int64, error) { + filePath := filepath.Join(cgroupPath, "cpuacct.usage") + _, err := os.Stat(filePath) + if err != nil { + return 0, err + } + content, err := os.ReadFile(filePath) + if err != nil { + return 0, err + } + cpuUsage, err := strconv.ParseInt(strings.TrimSpace(string(content)), 10, 64) + return cpuUsage, err +} + +// The CPU time obtained is in microseconds. 
+// Reads the usage_usec field of <cgroupPath>/cpu.stat. Named results let the
+// deferred Close propagate an error that was previously silently dropped.
+func getCgroupV2CpuTime(cgroupPath string) (cpuUsage int64, err error) {
+	filePath := filepath.Join(cgroupPath, "cpu.stat")
+	// os.Open reports non-existence itself; a prior os.Stat was redundant (TOCTOU).
+	file, err := os.Open(filePath)
+	if err != nil {
+		return 0, err
+	}
+	defer func() {
+		// Without named result parameters this assignment was dead code.
+		if cerr := file.Close(); cerr != nil && err == nil {
+			err = cerr
+		}
+	}()
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		columns := strings.Split(scanner.Text(), " ")
+		if len(columns) >= 2 && columns[0] == "usage_usec" {
+			return strconv.ParseInt(strings.TrimSpace(columns[1]), 10, 64)
+		}
+	}
+	if serr := scanner.Err(); serr != nil {
+		return 0, serr
+	}
+	// The original returned (0, nil) here, silently reporting zero CPU time.
+	return 0, fmt.Errorf("no usage_usec entry in %q", filePath)
+}
+
+// bytesToMiB converts bytes to MiB
+func bytesToMiB(bytes uint64) float64 {
+	return float64(bytes) / (1024 * 1024)
+}
+
+// bytesToMB converts bytes to MB
+func bytesToMB(bytes uint64) float64 {
+	return float64(bytes) / (1000 * 1000)
+}
+
+// readStatFrom reads a single uint64 counter from a resctrl stat file and
+// returns it together with the read timestamp in the fmtTime layout.
+func readStatFrom(path string) (uint64, string, error) {
+	context, err := os.ReadFile(path)
+	now := time.Now().Format(fmtTime)
+	if err != nil {
+		return 0, now, err
+	}
+
+	contextString := string(bytes.TrimSpace(context))
+
+	if contextString == unavailable {
+		err := fmt.Errorf("\"Unavailable\" value from file %q", path)
+		return 0, now, err
+	}
+
+	stat, err := strconv.ParseUint(contextString, 10, 64)
+	if err != nil {
+		return stat, now, fmt.Errorf("unable to parse %q as a uint from file %q", string(context), path)
+	}
+
+	return stat, now, nil
+}
+
+// processStats turns two consecutive raw samples into derived values: MBps
+// for memory bandwidth, MiB for llc occupancy and memory, CPU count for cpu.
+func processStats(oldStats RawStats, newStats RawStats) (ProcessedStats, error) {
+	pstats := ProcessedStats{
+		socketNum: newStats.SocketNum,
+	}
+	var sumCmtStats float64
+	var sumMbmTotal float64
+	var sumMbmLocal float64
+
+	if newStats.Cache != nil {
+		pstats.Cache = make(map[string]ProcessedCacheStats, 0)
+		for nid, llc := range newStats.Cache {
+			cmt := bytesToMiB(llc.LLCOccupancy)
+			sumCmtStats += cmt
+			pstats.Cache[nid] = ProcessedCacheStats{
+				LLCOccupancy: cmt,
+			}
+		}
+		pstats.SumCache = ProcessedCacheStats{
+			LLCOccupancy: sumCmtStats,
+		}
+	}
+	if 
newStats.MemoryBandwidth != nil && oldStats.MemoryBandwidth != nil { + pstats.MemoryBandwidth = make(map[string]ProcessedMemoryBandwidthStats, 0) + for nid, newStat := range newStats.MemoryBandwidth { + oldStat, ok := oldStats.MemoryBandwidth[nid] + if !ok { + return pstats, fmt.Errorf("missing socket %q in oldStats", nid) + } + otTime, err := time.Parse(fmtTime, oldStat.TotalBytesTimeStamp) + if err != nil { + return pstats, err + } + ntTime, err := time.Parse(fmtTime, newStat.TotalBytesTimeStamp) + if err != nil { + return pstats, err + } + olTime, err := time.Parse(fmtTime, oldStat.LocalBytesTimeStamp) + if err != nil { + return pstats, err + } + nlTime, err := time.Parse(fmtTime, newStat.LocalBytesTimeStamp) + if err != nil { + return pstats, err + } + tmbm := bytesToMB(newStat.TotalBytes-oldStat.TotalBytes) / ntTime.Sub(otTime).Seconds() + lmbm := bytesToMB(newStat.LocalBytes-oldStat.LocalBytes) / nlTime.Sub(olTime).Seconds() + sumMbmTotal += tmbm + sumMbmLocal += lmbm + pstats.MemoryBandwidth[nid] = ProcessedMemoryBandwidthStats{ + TotalMBps: tmbm, + LocalMBps: lmbm, + } + } + pstats.SumMemoryBandwidth = ProcessedMemoryBandwidthStats{ + TotalMBps: sumMbmTotal, + LocalMBps: sumMbmLocal, + } + } + if oldStats.CPUUtilization != nil && newStats.CPUUtilization != nil { + ocTime, err := time.Parse(fmtTime, oldStats.CPUUtilization.TimeStamp) + if err != nil { + return pstats, err + } + ncTime, err := time.Parse(fmtTime, newStats.CPUUtilization.TimeStamp) + if err != nil { + return pstats, err + } + pstats.CPUUtilization = float64(newStats.CPUUtilization.CPU-oldStats.CPUUtilization.CPU) / + float64(ncTime.Sub(ocTime).Microseconds()) + } + if newStats.Memory != 0 { + pstats.Memory = float64(newStats.Memory) / 1024 / 1024 + } + return pstats, nil +} + +func makeMonitorGroup(monPath string) error { + info, err := os.Stat(monPath) + if os.IsNotExist(err) { + err := os.Mkdir(monPath, 0755) + if err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + 
} else if err != nil {
+		return fmt.Errorf("failed to check directory %v: %v", monPath, err)
+	} else if !info.IsDir() {
+		return fmt.Errorf("%s already exists but is not a directory", monPath)
+	} else {
+		fmt.Printf("Directory %s already exists\n", monPath)
+	}
+
+	return nil
+}
+
+// writePidsToTasks copies the thread ids of a cgroup (cgroup.threads) into
+// the tasks file of the matching resctrl monitor group.
+func writePidsToTasks(monPath string, cgroupPath string) error {
+	containerPids, err := readCPUTasks(cgroupPath + "/cgroup.threads")
+	if err != nil {
+		return fmt.Errorf("failed to read %v/cgroup.threads: %v", cgroupPath, err)
+	}
+	err = writeTaskIDsToFile(containerPids, monPath+"/tasks")
+	if err != nil {
+		return fmt.Errorf("failed to write to %v/tasks: %v", monPath, err)
+	}
+	return nil
+}
+
+// readCPUTasks parses one pid/tid per line from path; an empty path yields
+// (nil, nil) so callers may pass an unset cgroup path harmlessly.
+func readCPUTasks(path string) ([]int32, error) {
+	if path == "" {
+		return nil, nil
+	}
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	tasksStr := strings.Trim(string(data), "\n")
+	values := make([]int32, 0)
+	lines := strings.Split(tasksStr, "\n")
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if len(line) <= 0 {
+			continue
+		}
+		v, err := strconv.ParseInt(line, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("cannot parse cgroup value of line %s, err: %v", line, err)
+		}
+		values = append(values, int32(v))
+	}
+	return values, nil
+}
+
+// writeTaskIDsToFile truncates filename and writes one pid per line. The
+// result is named so the deferred Close can report a flush failure; the
+// original assigned to a non-named err, which was dead code.
+func writeTaskIDsToFile(pids []int32, filename string) (err error) {
+	file, err := os.OpenFile(filename, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if cerr := file.Close(); cerr != nil && err == nil {
+			err = cerr
+		}
+	}()
+	for _, id := range pids {
+		// A fresh werr avoids := shadowing the named return inside the loop.
+		if _, werr := file.WriteString(strconv.FormatInt(int64(id), 10) + "\n"); werr != nil {
+			return werr
+		}
+	}
+
+	return nil
+}
+
+// stringInSlice reports whether str is an element of list.
+func stringInSlice(str string, list []string) bool {
+	for _, v := range list {
+		if v == str {
+			return true
+		}
+	}
+	return false
+}
diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/container_collector.go 
b/kubernetes-addons/memory-bandwidth-exporter/collector/container_collector.go new file mode 100644 index 00000000..18ff2476 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/container_collector.go @@ -0,0 +1,290 @@ +package collector + +import ( + "fmt" + "os" + "strings" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/opea-project/GenAIInfra/kubernetes-addons/memory-bandwidth-exporter/info" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/utils/clock" +) + +const ( + containerCollectorSubsystem = "container" +) + +type containerCollector struct { + statsCache map[string]stats + containerInfos map[string]info.ContainerInfo + interval time.Duration + logger log.Logger + namespaceWhiteList []string + monTimes int + metrics map[string]struct{} +} + +func init() { + registerCollector(containerCollectorSubsystem, defaultEnabled, NewContainerCollector) +} + +// NewContainerCollector returns a new Collector exposing container level memory bandwidth metrics. 
+func NewContainerCollector(logger log.Logger, interval time.Duration) (Collector, error) {
+	var ns []string
+	if *namespaceWhiteList != "" {
+		ns = strings.Split(*namespaceWhiteList, ",")
+	}
+	c := &containerCollector{
+		statsCache:         make(map[string]stats),
+		containerInfos:     make(map[string]info.ContainerInfo),
+		interval:           interval,
+		logger:             logger,
+		namespaceWhiteList: ns,
+		monTimes:           *monTimes,
+		metrics:            make(map[string]struct{}),
+	}
+	logger.Log("info", "new container collector", "metrics", *containerCollectorMetrics)
+	if *containerCollectorMetrics == allMetrics {
+		for _, m := range allContainerMetrics {
+			c.metrics[m] = struct{}{}
+		}
+	} else if *containerCollectorMetrics != noMetrics {
+		for _, m := range strings.Split(*containerCollectorMetrics, ",") {
+			c.metrics[m] = struct{}{}
+		}
+	}
+	c.Start()
+	return c, nil
+}
+
+// Start launches the goroutine that consumes container lifecycle events from
+// info.ContainerInfoChan until the channel is closed.
+func (c *containerCollector) Start() {
+	c.logger.Log("info", "start container collector", "metrics", getMetricsKeys(c.metrics))
+	go func() {
+		for {
+			select {
+			case cdata, ok := <-info.ContainerInfoChan:
+				if !ok {
+					c.logger.Log("err", "Channel closed, stopping data processing.")
+					// A receive on a closed channel never blocks; without this
+					// return the loop would spin hot, logging and processing a
+					// nil map forever instead of actually stopping.
+					return
+				}
+				err := c.processContainerData(cdata)
+				if err != nil {
+					c.logger.Log("err", fmt.Sprintf("Cannot process container data: %v", err))
+				}
+			default:
+				time.Sleep(50 * time.Millisecond)
+			}
+		}
+	}()
+}
+
+// processContainerData applies one batch of container events: Operation 0
+// drops cached state, 1/2 (re)register the container, and 2 additionally
+// creates the resctrl monitor group and back-fills its pids.
+func (c *containerCollector) processContainerData(data map[string]info.ContainerInfo) error {
+	for containerId, containerInfo := range data {
+		if len(c.namespaceWhiteList) > 0 && stringInSlice(containerInfo.NameSpace, c.namespaceWhiteList) {
+			continue
+		}
+		if data[containerId].Operation == 0 {
+			delete(c.statsCache, containerId)
+			delete(c.containerInfos, containerId)
+		}
+		level.Info(c.logger).Log("msg", "ContainerInfoChan received", "operation", data[containerId].Operation,
+			"pod name", data[containerId].PodName, "container id", containerId, "container name",
+			data[containerId].ContainerName, "namespace", data[containerId].NameSpace)
+		if 
data[containerId].Operation == 1 || data[containerId].Operation == 2 { + c.containerInfos[containerId] = containerInfo + go c.housekeeping(containerId) + } + + if data[containerId].Operation == 2 && isNeedCollectMbLLc(c.metrics) { + err := makeMonitorGroup(c.containerInfos[containerId].MonGroupPath) + if err != nil { + return fmt.Errorf("failed to create monitor group: %v", err) + } + go c.updatePids(containerId) + } + } + return nil +} + +func (c *containerCollector) updatePids(containerId string) { + for i := 0; i < c.monTimes; i++ { + c.logger.Log("debug", fmt.Sprintf(`Scan for the %v time and update the pids of the container + created before the exporter started.`, i+1)) + if _, ok := c.containerInfos[containerId]; !ok { + return + } + if err := writePidsToTasks(c.containerInfos[containerId].MonGroupPath, + c.containerInfos[containerId].CgroupPath); err != nil { + c.logger.Log("err", fmt.Sprintf("failed to update container %v stats: %v", containerId, err)) + } + time.Sleep(jitter(c.interval)) + } +} + +func (c *containerCollector) housekeeping(containerId string) { + clock := clock.RealClock{} + houseKeepingTimer := clock.NewTimer(c.interval) + defer houseKeepingTimer.Stop() + for range houseKeepingTimer.C() { + _, err := os.Stat(c.containerInfos[containerId].CgroupPath) + if os.IsNotExist(err) { + c.logger.Log("info", fmt.Sprintf("container %v cgroup path %v does not exist, deleting cache", + containerId, c.containerInfos[containerId].CgroupPath)) + delete(c.statsCache, containerId) + delete(c.containerInfos, containerId) + return + } + if err != nil { + c.logger.Log("err", fmt.Sprintf("failed to stat cgroup path %v: %v", c.containerInfos[containerId].CgroupPath, err)) + return + } + if err := c.updateStats(containerId); err != nil { + c.logger.Log("err", fmt.Sprintf("failed to update container %v stats: %v", containerId, err)) + return + } + houseKeepingTimer.Reset(jitter(c.interval)) + } +} + +func (c *containerCollector) updateStats(containerId string) 
error { + newStats := RawStats{} + var err error + if isNeedCollectMbLLc(c.metrics) { + newStats.SocketNum, newStats.MemoryBandwidth, newStats.Cache, err = + getIntelRDTStatsFrom(c.containerInfos[containerId].MonGroupPath) + if err != nil { + return err + } + } + if isNeedCollectCpu(c.metrics) { + cpuUtilization, err := getCPUUtilizationFrom(c.containerInfos[containerId].CgroupPath) + if err != nil { + return err + } + newStats.CPUUtilization = &cpuUtilization + } + if isNeedCollectMemory(c.metrics) { + newStats.Memory, err = getMemorySizeFrom(c.containerInfos[containerId].CgroupPath) + if err != nil { + return err + } + } + if oldStats, ok := c.statsCache[containerId]; ok { + pStats, err := processStats(oldStats.oldStats, newStats) + if err != nil { + return err + } + c.statsCache[containerId] = stats{ + oldStats: newStats, + processedStats: pStats, + } + } else { + c.statsCache[containerId] = stats{ + oldStats: newStats, + processedStats: ProcessedStats{}, + } + } + return nil +} + +func (c *containerCollector) Update(ch chan<- prometheus.Metric) error { + if len(c.statsCache) == 0 { + c.logger.Log("info", "container collector stats have no cache") + return nil + } + // cid is the container id + for cid, stats := range c.statsCache { + if isNeedCollectMbLLc(c.metrics) && stats.processedStats.MemoryBandwidth != nil { + ch <- prometheus.MustNewConstMetric( + sumTotalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.processedStats.SumMemoryBandwidth.TotalMBps, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + ch <- prometheus.MustNewConstMetric( + sumLocalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.processedStats.SumMemoryBandwidth.LocalMBps, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + // sid is the socket id + for sid, s := range stats.processedStats.MemoryBandwidth { + ch <- prometheus.MustNewConstMetric( + 
totalMemoryBandwidthDesc, + prometheus.GaugeValue, + s.TotalMBps, + sid, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + ch <- prometheus.MustNewConstMetric( + localMemoryBandwidthDesc, + prometheus.GaugeValue, + s.LocalMBps, + sid, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + } + } + if isNeedCollectMbLLc(c.metrics) && stats.processedStats.Cache != nil { + ch <- prometheus.MustNewConstMetric( + sumLLCacheDesc, + prometheus.GaugeValue, + stats.processedStats.SumCache.LLCOccupancy, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + // sid is the socket id + for sid, s := range stats.processedStats.Cache { + ch <- prometheus.MustNewConstMetric( + llcacheDesc, + prometheus.GaugeValue, + s.LLCOccupancy, + sid, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + } + } + if isNeedCollectCpu(c.metrics) { + ch <- prometheus.MustNewConstMetric( + cpuUtilizationDesc, + prometheus.GaugeValue, + stats.processedStats.CPUUtilization, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + } + if isNeedCollectMemory(c.metrics) { + ch <- prometheus.MustNewConstMetric( + memoryDesc, + prometheus.GaugeValue, + stats.processedStats.Memory, + cid, + c.containerInfos[cid].ContainerName, + c.containerInfos[cid].PodName, + c.containerInfos[cid].NameSpace, + ) + } + } + return nil +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/info.go b/kubernetes-addons/memory-bandwidth-exporter/collector/info.go new file mode 100644 index 00000000..7d5eb614 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/info.go @@ -0,0 +1,166 @@ +package collector + +import ( + 
"github.com/prometheus/client_golang/prometheus" +) + +var ( + // container + sumTotalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "sum_total_memory_bandwidth"), + "The sum of total memory bandwidth for all sockets in MBps.", + []string{"containerId", "containerName", "podName", "nameSpace"}, nil, + ) + sumLocalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "sum_local_memory_bandwidth"), + "The sum of local memory bandwidth for all sockets in MBps.", + []string{"containerId", "containerName", "podName", "nameSpace"}, nil, + ) + totalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "total_memory_bandwidth"), + "One socket total memory bandwidth in MBps.", + []string{"socketId", "containerId", "containerName", "podName", "nameSpace"}, nil, + ) + localMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "local_memory_bandwidth"), + "One socket local memory bandwidth in MBps.", + []string{"socketId", "containerId", "containerName", "podName", "nameSpace"}, nil, + ) + sumLLCacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "sum_llc_occupancy"), + "The sum of llc occupancy for all sockets in MiB.", + []string{"containerId", "containerName", "podName", "nameSpace"}, nil, + ) + llcacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "llc_occupancy"), + "One socket llc occupancy in MiB.", + []string{"socketId", "containerId", "containerName", "podName", "nameSpace"}, nil, + ) + cpuUtilizationDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "cpu_utilization"), + "The CPU utilization of the container refers to the number of CPUs it uses.", + []string{"containerId", "containerName", "podName", "nameSpace"}, 
nil, + ) + memoryDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, containerCollectorSubsystem, "memory"), + "The memory usage of the container in MiB.", + []string{"containerId", "containerName", "podName", "nameSpace"}, nil, + ) + //node + nodeTotalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeCollectorSubsystem, "total_memory_bandwidth"), + "The sum of total memory bandwidth for all sockets in MBps.", + []string{"nodeName"}, nil, + ) + nodeLocalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeCollectorSubsystem, "local_memory_bandwidth"), + "The sum of local memory bandwidth for all sockets in MBps.", + []string{"nodeName"}, nil, + ) + socketTotalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, socketCollectorSubsystem, "total_memory_bandwidth"), + "One socket total memory bandwidth in MBps.", + []string{"socketId", "nodeName"}, nil, + ) + socketLocalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, socketCollectorSubsystem, "local_memory_bandwidth"), + "One socket local memory bandwidth in MBps.", + []string{"socketId", "nodeName"}, nil, + ) + nodeLLCacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, nodeCollectorSubsystem, "llc_occupancy"), + "The sum of llc occupancy for all sockets in MiB.", + []string{"nodeName"}, nil, + ) + socketLLCacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, socketCollectorSubsystem, "llc_occupancy"), + "One socket llc occupancy in MiB.", + []string{"socketId", "nodeName"}, nil, + ) + // class + classTotalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, "total_memory_bandwidth"), + "The sum of total memory bandwidth for all sockets in MBps.", + []string{"className", "nodeName"}, nil, + ) + classLocalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, 
"local_memory_bandwidth"), + "The sum of local memory bandwidth for all sockets in MBps.", + []string{"className", "nodeName"}, nil, + ) + classLLCacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, "llc_occupancy"), + "The sum of llc occupancy for all sockets in MiB.", + []string{"className", "nodeName"}, nil, + ) + socketClassTotalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, "socket_total_memory_bandwidth"), + "One socket total memory bandwidth in MBps.", + []string{"socketId", "className", "nodeName"}, nil, + ) + socketClassLocalMemoryBandwidthDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, "socket_local_memory_bandwidth"), + "One socket local memory bandwidth in MBps.", + []string{"socketId", "className", "nodeName"}, nil, + ) + socketClassLLCacheDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, classCollectorSubsystem, "socket_llc_occupancy"), + "One socket llc occupancy in MiB.", + []string{"socketId", "className", "nodeName"}, nil, + ) +) + +type stats struct { + oldStats RawStats + processedStats ProcessedStats +} + +type ProcessedStats struct { + socketNum int + SumMemoryBandwidth ProcessedMemoryBandwidthStats + SumCache ProcessedCacheStats + MemoryBandwidth map[string]ProcessedMemoryBandwidthStats + Cache map[string]ProcessedCacheStats + CPUUtilization float64 // cpu nums, not % + Memory float64 // MiB +} + +type RawStats struct { + SocketNum int + MemoryBandwidth map[string]RawMemoryBandwidthStats + Cache map[string]RawCacheStats + CPUUtilization *RawCPUStats + Memory int64 // bytes +} + +type RawCPUStats struct { + CPU int64 // microseconds + TimeStamp string +} + +type ProcessedMemoryBandwidthStats struct { + // The 'mbm_total_bytes' to MBps + TotalMBps float64 + // The 'mbm_local_bytes'. to MBps + LocalMBps float64 +} + +// MemoryBandwidthStats corresponds to MBM (Memory Bandwidth Monitoring). 
+type RawMemoryBandwidthStats struct { + // The 'mbm_total_bytes' + TotalBytes uint64 + TotalBytesTimeStamp string + // The 'mbm_local_bytes'. + LocalBytes uint64 + LocalBytesTimeStamp string +} + +type RawCacheStats struct { + // The 'llc_occupancy' + LLCOccupancy uint64 +} + +type ProcessedCacheStats struct { + // The 'llc_occupancy' to MiB + LLCOccupancy float64 +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/collector/node_collector.go b/kubernetes-addons/memory-bandwidth-exporter/collector/node_collector.go new file mode 100644 index 00000000..08fbbe96 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/collector/node_collector.go @@ -0,0 +1,145 @@ +package collector + +import ( + "strings" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + nodeCollectorSubsystem = "node" + socketCollectorSubsystem = "socket" +) + +type nodeCollctor struct { + interval time.Duration + logger log.Logger + nodeName string + statsCache *stats + monGroupPath string + metrics map[string]struct{} +} + +func init() { + registerCollector(nodeCollectorSubsystem, defaultDisabled, NewNodeCollector) +} + +// NewNodeCollector returns a new Collector exposing node level memory bandwidth metrics. 
+func NewNodeCollector(logger log.Logger, interval time.Duration) (Collector, error) { + c := &nodeCollctor{ + interval: interval, + logger: logger, + monGroupPath: rootResctrlPath, + nodeName: *nodeName, + metrics: make(map[string]struct{}), + } + logger.Log("info", "new node collector", "metrics:", *nodeCollectorMetrics) + if *nodeCollectorMetrics == allMetrics { + for _, m := range allNodeMetrics { + c.metrics[m] = struct{}{} + } + } else if *nodeCollectorMetrics != noMetrics { + for _, m := range strings.Split(*nodeCollectorMetrics, ",") { + c.metrics[m] = struct{}{} + } + } + c.Start() + return c, nil +} + +func (c *nodeCollctor) Start() { + c.logger.Log("info", "start node collector", "metrics", getMetricsKeys(c.metrics)) + go func() { + for { + err := c.updateStats() + if err != nil { + c.logger.Log("error", "node collector update stats failed", "err", err) + } + time.Sleep(jitter(c.interval)) + } + }() +} + +func (c *nodeCollctor) updateStats() error { + newStats := RawStats{} + var err error + if isNeedCollectMbLLc(c.metrics) { + newStats.SocketNum, newStats.MemoryBandwidth, newStats.Cache, err = getIntelRDTStatsFrom(c.monGroupPath) + if err != nil { + return err + } + } + if c.statsCache != nil { + pStats, err := processStats(c.statsCache.oldStats, newStats) + if err != nil { + return err + } + c.statsCache = &stats{ + oldStats: newStats, + processedStats: pStats, + } + } else { + c.statsCache = &stats{ + oldStats: newStats, + processedStats: ProcessedStats{}, + } + } + return nil +} + +func (c *nodeCollctor) Update(ch chan<- prometheus.Metric) error { + if c.statsCache == nil { + c.logger.Log("info", "node collector stats have no cache") + return nil + } + if !isNeedCollectMbLLc(c.metrics) { + return nil + } + ch <- prometheus.MustNewConstMetric( + nodeTotalMemoryBandwidthDesc, + prometheus.GaugeValue, + c.statsCache.processedStats.SumMemoryBandwidth.TotalMBps, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + nodeLocalMemoryBandwidthDesc, + 
prometheus.GaugeValue, + c.statsCache.processedStats.SumMemoryBandwidth.LocalMBps, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + nodeLLCacheDesc, + prometheus.GaugeValue, + c.statsCache.processedStats.SumCache.LLCOccupancy, + c.nodeName, + ) + for socket, stats := range c.statsCache.processedStats.Cache { + ch <- prometheus.MustNewConstMetric( + socketLLCacheDesc, + prometheus.GaugeValue, + stats.LLCOccupancy, + socket, + c.nodeName, + ) + } + for socket, stats := range c.statsCache.processedStats.MemoryBandwidth { + ch <- prometheus.MustNewConstMetric( + socketTotalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.TotalMBps, + socket, + c.nodeName, + ) + ch <- prometheus.MustNewConstMetric( + socketLocalMemoryBandwidthDesc, + prometheus.GaugeValue, + stats.LocalMBps, + socket, + c.nodeName, + ) + } + + return nil +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/config/config.sh b/kubernetes-addons/memory-bandwidth-exporter/config/config.sh new file mode 100755 index 00000000..69b80a36 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/config/config.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +sudo apt install jq +sudo mkdir -p /etc/containers/oci/hooks.d/ +sudo cp plugin/etc/containers/oci/hooks.d/* /etc/containers/oci/hooks.d/ +sudo cp plugin/usr/local/sbin/* /usr/local/sbin/ diff --git a/kubernetes-addons/memory-bandwidth-exporter/config/manifests/memory-bandwidth-exporter.yaml b/kubernetes-addons/memory-bandwidth-exporter/config/manifests/memory-bandwidth-exporter.yaml new file mode 100644 index 00000000..de7b7706 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/config/manifests/memory-bandwidth-exporter.yaml @@ -0,0 +1,59 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: memory-bandwidth-exporter + namespace: system +spec: + selector: + 
matchLabels: + app: memory-bandwidth-exporter + template: + metadata: + labels: + app: memory-bandwidth-exporter + spec: + containers: + - name: memory-bandwidth-exporter + image: MBE_IMG + imagePullPolicy: IfNotPresent + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NAMESPACE_WHITELIST + value: "calico-apiserver,calico-system,kube-system,tigera-operator" + volumeMounts: + - name: hooks + mountPath: /etc/containers/oci/hooks.d/ + readOnly: true + - name: usr + mountPath: /usr/ + readOnly: true + - name: resctrl + mountPath: /sys/fs/resctrl/ + - name: nri + mountPath: /var/run/nri/ + readOnly: true + securityContext: + privileged: true + ports: + - containerPort: 9100 + name: http + volumes: + - name: hooks + hostPath: + path: /etc/containers/oci/hooks.d/ + - name: usr + hostPath: + path: /usr/ + - name: resctrl + hostPath: + path: /sys/fs/resctrl/ + - name: nri + hostPath: + path: /var/run/nri/ + hostPID: true diff --git a/kubernetes-addons/memory-bandwidth-exporter/go.mod b/kubernetes-addons/memory-bandwidth-exporter/go.mod new file mode 100644 index 00000000..0932010a --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/go.mod @@ -0,0 +1,49 @@ +module github.com/opea-project/GenAIInfra/kubernetes-addons/memory-bandwidth-exporter + +go 1.22.0 + +toolchain go1.22.4 + +require ( + github.com/alecthomas/kingpin/v2 v2.4.0 + github.com/containerd/nri v0.6.1 + github.com/containers/common v0.59.1 + github.com/go-kit/log v0.2.1 + github.com/opencontainers/runtime-spec v1.2.0 + github.com/prometheus/client_golang v1.19.1 + github.com/prometheus/common v0.48.0 + github.com/prometheus/exporter-toolkit v0.11.0 + k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 +) + +require ( + github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 
v2.2.0 // indirect + github.com/containerd/ttrpc v1.2.3 // indirect + github.com/containers/storage v1.54.0 // indirect + github.com/coreos/go-systemd/v22 v22.5.0 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/go-logfmt/logfmt v0.5.1 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/jpillora/backoff v1.0.0 // indirect + github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect + github.com/prometheus/client_model v0.5.0 // indirect + github.com/prometheus/procfs v0.12.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + github.com/xhit/go-str2duration/v2 v2.1.0 // indirect + golang.org/x/crypto v0.23.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/oauth2 v0.16.0 // indirect + golang.org/x/sync v0.7.0 // indirect + golang.org/x/sys v0.20.0 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect + google.golang.org/grpc v1.62.1 // indirect + google.golang.org/protobuf v1.33.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + k8s.io/cri-api v0.25.3 // indirect +) diff --git a/kubernetes-addons/memory-bandwidth-exporter/go.sum b/kubernetes-addons/memory-bandwidth-exporter/go.sum new file mode 100644 index 00000000..fdcfbaaa --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/go.sum @@ -0,0 +1,164 @@ +github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= +github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 
h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/containerd/nri v0.6.1 h1:xSQ6elnQ4Ynidm9u49ARK9wRKHs80HCUI+bkXOxV4mA= +github.com/containerd/nri v0.6.1/go.mod h1:7+sX3wNx+LR7RzhjnJiUkFDhn18P5Bg/0VnJ/uXpRJM= +github.com/containerd/ttrpc v1.2.3 h1:4jlhbXIGvijRtNC8F/5CpuJZ7yKOBFGFOOXg1bkISz0= +github.com/containerd/ttrpc v1.2.3/go.mod h1:ieWsXucbb8Mj9PH0rXCw1i8IunRbbAiDkpXkbfflWBM= +github.com/containers/common v0.59.1 h1:7VkmJN3YvD0jLFwaUjLHSRJ98JLffydiyOJjYr0dUTo= +github.com/containers/common v0.59.1/go.mod h1:53VicJCZ2AD0O+Br7VVoyrS7viXF4YmwlTIocWUT8XE= +github.com/containers/storage v1.54.0 h1:xwYAlf6n9OnIlURQLLg3FYHbO74fQ/2W2N6EtQEUM4I= +github.com/containers/storage v1.54.0/go.mod h1:PlMOoinRrBSnhYODLxt4EXl0nmJt+X0kjG0Xdt9fMTw= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= +github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= +github.com/go-logfmt/logfmt v0.5.1 h1:otpy5pqBCBZ1ng9RQ0dPu4PN7ba75Y/aA+UpowDyNVA= +github.com/go-logfmt/logfmt v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 
h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/onsi/ginkgo/v2 v2.18.0 h1:W9Y7IWXxPUpAit9ieMOLI7PJZGaW22DTKgiVAuhDTLc= +github.com/onsi/ginkgo/v2 v2.18.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= +github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= +github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= +github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= +github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= +github.com/prometheus/exporter-toolkit v0.11.0 h1:yNTsuZ0aNCNFQ3aFTD2uhPOvr4iD7fdBvKPAEGkNf+g= +github.com/prometheus/exporter-toolkit v0.11.0/go.mod h1:BVnENhnNecpwoTLiABx7mrPB/OLRIgN74qlQbV+FK1Q= +github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= +github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= +github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= +golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw= +golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine 
v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 h1:NnYq6UN9ReLM9/Y01KWNOWyI5xQ9kbIms5GGJVwS/Yc= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= +google.golang.org/grpc v1.62.1 h1:B4n+nfKzOICUXMgyrNd19h/I9oH0L1pizfk1d4zSgTk= +google.golang.org/grpc v1.62.1/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/cri-api v0.25.3 h1:YaiQ05CM4+5L2DAz0KoSa4sv4/VlQvLbf3WHKICPSXs= +k8s.io/cri-api v0.25.3/go.mod h1:riC/P0yOGUf2K1735wW+CXs1aY2ctBgePtnnoFLd0dU= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod 
h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= diff --git a/kubernetes-addons/memory-bandwidth-exporter/info/container.go b/kubernetes-addons/memory-bandwidth-exporter/info/container.go new file mode 100644 index 00000000..00f840d3 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/info/container.go @@ -0,0 +1,17 @@ +package info + +var ( + chanSize = 100 + // containerid:ContainerInfo + ContainerInfoChan = make(chan map[string]ContainerInfo, chanSize) +) + +type ContainerInfo struct { + Operation int // 0:delete 1:add 2:add and watch + ContainerName string + ContainerId string + PodName string + NameSpace string + CgroupPath string + MonGroupPath string +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/create-runtime.json b/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/create-runtime.json new file mode 100644 index 00000000..9e3a6b7d --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/create-runtime.json @@ -0,0 +1,10 @@ +{ + "version": "1.0.0", + "hook": { + "path": "/usr/local/sbin/create-runtime.sh" + }, + "when": { + "always": true + }, + "stages": ["createRuntime"] +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/post-stop.json b/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/post-stop.json new file mode 100644 index 00000000..7f295cc9 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/plugin/etc/containers/oci/hooks.d/post-stop.json @@ -0,0 +1,10 @@ +{ + "version": "1.0.0", + "hook": { + "path": "/usr/local/sbin/post-stop.sh" + }, + "when": { + "always": true + }, + "stages": ["poststop"] +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/plugin/hook-injector.go b/kubernetes-addons/memory-bandwidth-exporter/plugin/hook-injector.go new file mode 100644 index 00000000..5ec74a0e --- /dev/null +++ 
b/kubernetes-addons/memory-bandwidth-exporter/plugin/hook-injector.go @@ -0,0 +1,225 @@ +package plugin + +import ( + "context" + "fmt" + "os" + "strings" + + "github.com/containers/common/pkg/hooks" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/opea-project/GenAIInfra/kubernetes-addons/memory-bandwidth-exporter/info" + rspec "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/containerd/nri/pkg/api" + "github.com/containerd/nri/pkg/stub" +) + +const ( + NriPluginName = "memory-bandwidth-exporter.v1" + conSuffix = ".scope" + podPrefix = "/sys/fs/cgroup" + conPrefix = "/cri-containerd-" + rootMonDir = "/sys/fs/resctrl/mon_groups/" + monGroupPrefix = "container-" + annRDT = "rdt.resources.beta.kubernetes.io/pod" +) + +var ( + isNeedMakeMonitorGroup = true +) + +type Plugin struct { + PluginName string + PluginIdx string + DisableWatch bool + Stub stub.Stub + Mgr *hooks.Manager + Logger log.Logger +} + +func (p *Plugin) Synchronize(_ context.Context, pod []*api.PodSandbox, container []*api.Container) ( + []*api.ContainerUpdate, error) { + dropRepeat := make(map[string]int) + for _, container := range container { + if _, ok := dropRepeat[container.Id]; ok { + continue + } + dropRepeat[container.Id] = 1 + for _, pod := range pod { + if _, ok := pod.Annotations[annRDT]; ok { + continue + } + if pod.Id == container.PodSandboxId { + cgroupPath := podPrefix + pod.Linux.CgroupParent + conPrefix + container.Id + conSuffix + monPath := rootMonDir + monGroupPrefix + container.Id + cif := info.ContainerInfo{ + Operation: 2, + ContainerName: container.Name, + ContainerId: container.Id, + PodName: pod.Name, + NameSpace: pod.Namespace, + CgroupPath: cgroupPath, + MonGroupPath: monPath, + } + ContainerInfoes := make(map[string]info.ContainerInfo) + ContainerInfoes[container.Id] = cif + if len(info.ContainerInfoChan) == cap(info.ContainerInfoChan) { + return nil, 
fmt.Errorf("ContainerInfoChan is full") + } + info.ContainerInfoChan <- ContainerInfoes + break + } + } + } + return nil, nil +} + +func (p *Plugin) CreateContainer(_ context.Context, pod *api.PodSandbox, container *api.Container) ( + *api.ContainerAdjustment, []*api.ContainerUpdate, error) { + if !isNeedMakeMonitorGroup { + return nil, nil, nil + } + + ctrName := containerName(pod, container) + + if val, ok := pod.Annotations[annRDT]; ok { + level.Info(p.Logger).Log("msg", "container %v has rdt annotation %v", ctrName, val) + return nil, nil, nil + } + + annotations := map[string]string{} + for k, v := range container.Annotations { + annotations[k] = v + } + for k, v := range pod.Annotations { + annotations[k] = v + } + hasBindMounts := len(container.Mounts) > 0 + + spec := &rspec.Spec{ + Process: &rspec.Process{ + Args: container.Args, + }, + } + + if _, err := p.Mgr.Hooks(spec, annotations, hasBindMounts); err != nil { + level.Error(p.Logger).Log("msg", "failed to generate hooks", "container", ctrName, "err", err) + return nil, nil, fmt.Errorf("hook generation failed: %w", err) + } + + if spec.Hooks == nil { + level.Info(p.Logger).Log("msg", "container %v has no hooks to inject, ignoring", ctrName) + return nil, nil, nil + } + + adjust := &api.ContainerAdjustment{} + adjust.AddHooks(api.FromOCIHooks(spec.Hooks)) + level.Info(p.Logger).Log("msg", "OCI hooks injected", "container", ctrName) + + return adjust, nil, nil +} + +func (p *Plugin) StartContainer(_ context.Context, pod *api.PodSandbox, container *api.Container) error { + if _, ok := pod.Annotations[annRDT]; ok { + return nil + } + + level.Info(p.Logger).Log("msg", "StartContainer stage", "container first pid", container.Pid) + + cif := info.ContainerInfo{ + Operation: 1, + ContainerName: container.Name, + ContainerId: container.Id, + PodName: pod.Name, + NameSpace: pod.Namespace, + CgroupPath: podPrefix + pod.Linux.CgroupParent + conPrefix + container.Id + conSuffix, + MonGroupPath: rootMonDir + 
monGroupPrefix + container.Id, + } + ContainerInfoes := make(map[string]info.ContainerInfo) + ContainerInfoes[container.Id] = cif + if len(info.ContainerInfoChan) == cap(info.ContainerInfoChan) { + return fmt.Errorf("ContainerInfoChan is full") + } + info.ContainerInfoChan <- ContainerInfoes + return nil +} + +func (p *Plugin) StopContainer(_ context.Context, pod *api.PodSandbox, container *api.Container) ( + []*api.ContainerUpdate, error) { + if _, ok := pod.Annotations[annRDT]; ok { + return nil, nil + } + + cif := info.ContainerInfo{ + Operation: 0, + ContainerName: container.Name, + ContainerId: container.Id, + } + ContainerInfoes := make(map[string]info.ContainerInfo) + ContainerInfoes[container.Id] = cif + if len(info.ContainerInfoChan) == cap(info.ContainerInfoChan) { + return nil, fmt.Errorf("ContainerInfoChan is full") + } + info.ContainerInfoChan <- ContainerInfoes + return nil, nil +} + +func containerName(pod *api.PodSandbox, container *api.Container) string { + if pod != nil { + return pod.Name + "/" + container.Name + } + return container.Name +} + +func (p *Plugin) Run(isNeed bool) error { + isNeedMakeMonitorGroup = isNeed + var ( + opts []stub.Option + mgr *hooks.Manager + err error + ) + + if p.PluginName != "" { + opts = append(opts, stub.WithPluginName(p.PluginName)) + } + if p.PluginIdx != "" { + opts = append(opts, stub.WithPluginIdx(p.PluginIdx)) + } + + if p.Stub, err = stub.New(p, opts...); err != nil { + return fmt.Errorf("failed to create plugin stub: %v", err) + } + + ctx := context.Background() + dirs := []string{hooks.DefaultDir, hooks.OverrideDir} + mgr, err = hooks.New(ctx, dirs, []string{}) + if err != nil { + return fmt.Errorf("failed to set up hook manager: %v", err) + } + p.Mgr = mgr + + if !p.DisableWatch { + for _, dir := range dirs { + if err = os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %q: %v", dir, err) + } + } + + sync := make(chan error, 2) + go mgr.Monitor(ctx, sync) + + err = 
<-sync + if err != nil { + return fmt.Errorf("failed to monitor hook directories: %v", err) + } + level.Info(p.Logger).Log("msg", "watching directories for new changes", "dirs", strings.Join(dirs, " ")) + } + + err = p.Stub.Run(ctx) + if err != nil { + return fmt.Errorf("plugin exited with error %v", err) + } + return nil +} diff --git a/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/create-runtime.sh b/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/create-runtime.sh new file mode 100755 index 00000000..880bfd29 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/create-runtime.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +container_state=$(cat) +container_pid=$(echo "$container_state" | jq -r '.pid') + +container_id=$(echo "$container_state" | jq -r '.id') +mon_group_dir="/sys/fs/resctrl/mon_groups/container-$container_id" +if [ ! -d "$mon_group_dir" ]; then + mkdir -p "$mon_group_dir" +fi + +tasks_file="$mon_group_dir/tasks" +echo "$container_pid" | sudo tee -a $tasks_file > /dev/null +echo "CREATE Container: $container_id, Container State: $container_state, Container PID: $container_pid" | sudo tee -a /var/log/container_pids.log > /dev/null diff --git a/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/post-stop.sh b/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/post-stop.sh new file mode 100755 index 00000000..f10b0b97 --- /dev/null +++ b/kubernetes-addons/memory-bandwidth-exporter/plugin/usr/local/sbin/post-stop.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +container_state=$(cat) +container_id=$(echo "$container_state" | jq -r '.id') +mon_group_dir="/sys/fs/resctrl/mon_groups/container-$container_id" +echo "DELETE Container: $container_id" >> /var/log/container_pids.log +sudo rmdir $mon_group_dir