Skip to content

Commit

Permalink
Add memory bandwidth exporter for AI workload.
Browse files Browse the repository at this point in the history
Signed-off-by: Yugar-1 <[email protected]>
  • Loading branch information
Yugar-1 committed Sep 5, 2024
1 parent 0094f52 commit 87ff280
Show file tree
Hide file tree
Showing 22 changed files with 2,506 additions and 1 deletion.
5 changes: 4 additions & 1 deletion .github/workflows/pr-go-unittests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ jobs:
- name: Run tests and generate coverage
run: |
if [ "${{ matrix.gopath }}" == "${MBE_DIR}" ]; then
exit 0
fi
cd ${{ matrix.gopath }}
go test -coverprofile=coverage.out $(go list ./... | grep -v /e2e)
../.github/workflows/scripts/go-coverage.sh
${{ github.workspace }}/.github/workflows/scripts/go-coverage.sh
62 changes: 62 additions & 0 deletions kubernetes-addons/memory-bandwidth-exporter/.golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# golangci-lint configuration for the memory-bandwidth-exporter module.

run:
  timeout: 5m
  allow-parallel-runners: true

issues:
  # Keep golangci-lint's built-in exclusion list switched off so that every
  # finding is reported unless it is excluded explicitly below.
  exclude-use-default: false
  exclude-rules:
    # Unchecked errors are tolerated in test files.
    - path: _test.go
      linters:
        - errcheck

linters:
  # Start from an empty set; only the linters listed under `enable` run.
  disable-all: true
  enable:
    - depguard
    - dupl
    - errcheck
    - exportloopref
    - goconst
    - gocyclo
    - gofmt
    - goimports
    - gosimple
    - govet
    - ineffassign
    - lll
    - misspell
    - nakedret
    - prealloc
    - revive
    - staticcheck
    - typecheck
    - unconvert
    - unparam
    - unused

linters-settings:
  depguard:
    rules:
      no_exec_policy:
        # Applies to every file except tests.
        files:
          - "!$test"
        deny:
          - pkg: "os/exec"
            desc: "Using os/exec to run sub processes is not allowed by policy"
  errcheck:
    exclude-functions:
      # Used in HTTP handlers, any error is handled by the server itself.
      - (net/http.ResponseWriter).Write
      # Never check for logger errors.
      - (github.com/go-kit/log.Logger).Log
  revive:
    rules:
      # https://github.com/mgechev/revive/blob/master/RULES_DESCRIPTIONS.md#unused-parameter
      - name: unused-parameter
        severity: warning
        disabled: true
17 changes: 17 additions & 0 deletions kubernetes-addons/memory-bandwidth-exporter/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ---- Build stage: compile the exporter binary ----
FROM golang:1.22 AS builder

# Target platform build arguments; GOOS falls back to linux below when
# TARGETOS is not provided.
ARG TARGETOS
ARG TARGETARCH

WORKDIR /workspace
COPY . .
RUN go mod download

# Build with cgo disabled, forcing a rebuild of all packages (-a).
RUN CGO_ENABLED=0 \
    GOOS=${TARGETOS:-linux} \
    GOARCH=${TARGETARCH} \
    go build -a -o memory-bandwidth-exporter cmd/main.go

# ---- Runtime stage: ship only the compiled binary ----
FROM ubuntu:22.04
USER root
WORKDIR /
COPY --from=builder /workspace/memory-bandwidth-exporter /memory-bandwidth-exporter

# The command is run through `bash -c` so that NODE_NAME and
# NAMESPACE_WHITELIST are expanded from the container environment at start-up.
ENTRYPOINT ["bash", "-c"]
CMD ["/memory-bandwidth-exporter --collector.node.name=${NODE_NAME} --collector.container.namespaceWhiteList=${NAMESPACE_WHITELIST}"]
75 changes: 75 additions & 0 deletions kubernetes-addons/memory-bandwidth-exporter/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# User-tunable knobs; override on the command line, e.g. `make VERSION=v1.0 DEBUG=1`.
GO_CMD := go
DEBUG ?= 0
DOCKER_REGISTRY ?= docker.io/opea
CONTAINER_TOOL ?= docker
VERSION ?= latest

# Release builds (DEBUG=0, the default) pass -s -w to the linker for all
# packages; any other DEBUG value leaves the linker flags untouched.
ifeq ($(DEBUG),0)
GOFLAGS=-ldflags="all=-s -w"
endif

# Fully qualified image reference used by the docker.* and change_img targets.
MBE_IMG_NAME = memory-bandwidth-exporter:$(VERSION)
MBE_IMG = ${DOCKER_REGISTRY}/${MBE_IMG_NAME}

# None of these targets produce a file named after themselves; declaring them
# phony keeps them working even if a file or directory with the same name
# (e.g. `test` or `build`) appears in the tree.
.PHONY: build docker.build docker.push clean change_img test lint lint-fix

build: ## Build the exporter binary into ./bin
	@mkdir -p bin
	@echo "Building memory-bandwidth-exporter binary..."
	$(GO_CMD) build -o bin/memory-bandwidth-exporter $(GOFLAGS) cmd/main.go

docker.build: ## Build the exporter container image
	@echo "Building memory-bandwidth-exporter Docker image..."
	$(CONTAINER_TOOL) build -t ${MBE_IMG} -f Dockerfile .

docker.push: ## Push the exporter container image
	@echo "Push memory-bandwidth-exporter Docker image..."
	$(CONTAINER_TOOL) push ${MBE_IMG}

clean: ## Remove build output
	@echo "Cleaning up..."
	rm -rf bin

# Substitute the MBE_IMG placeholder in the manifest with the real image
# reference. '|' is used as the sed delimiter because image references contain
# '/' and a backslash delimiter is not accepted by POSIX sed.
change_img: ## Write the image reference into the Kubernetes manifest
	sed -i "s|MBE_IMG|${MBE_IMG}|g" config/manifests/memory-bandwidth-exporter.yaml

test: ## Run unit tests
	@echo "Running tests..."
	$(GO_CMD) test ./...

lint: golangci-lint ## Run golangci-lint linter
	@echo "Running linters...${GOLANGCI_LINT}"
	$(GOLANGCI_LINT) run ./...

lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
	$(GOLANGCI_LINT) run --fix ./...

##@ Dependencies

## Location to install dependencies to
LOCALBIN ?= $(CURDIR)/bin
$(LOCALBIN):
	mkdir -p "$@"

## Tool Binaries
GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION)

## Tool Versions
GOLANGCI_LINT_VERSION ?= v1.59.1

.PHONY: golangci-lint
golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
# $(LOCALBIN) is an order-only prerequisite: the directory must exist, but its
# modification time (which changes whenever a file is added to it) must not
# mark the installed tool as out of date.
$(GOLANGCI_LINT): | $(LOCALBIN)
	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,${GOLANGCI_LINT_VERSION})

# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
# $1 - target path with name of binary (ideally with version)
# $2 - package url which can be installed
# $3 - specific version of package
# The binary is installed under its default name into $(LOCALBIN) and then
# renamed to $1 (the default name is $1 with the "-$3" suffix stripped).
define go-install-tool
@[ -f "$(1)" ] || { \
set -e; \
package=$(2)@$(3) ;\
echo "Downloading $${package}" ;\
GOBIN="$(LOCALBIN)" $(GO_CMD) install $${package} ;\
mv "$$(echo "$(1)" | sed "s/-$(3)$$//")" "$(1)" ;\
}
endef
148 changes: 148 additions & 0 deletions kubernetes-addons/memory-bandwidth-exporter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# memory bandwidth exporter

The pod/container-grained memory bandwidth exporter provides users with memory bandwidth metrics for their running containers. The metrics include llc_occupancy, mbm_local_bytes, mbm_total_bytes, CPU utilization and memory usage, and the metrics have been processed. In addition to container-level metrics, it also provides class-level and socket-level metrics. Users can configure the list of metrics to be collected. It serves as an exporter that can be connected to Prometheus-like observability tools, and it can also be used as a telemetry provider.

Memory bandwidth exporter makes use of state-of-the-art technologies like NRI to build a resource-efficient and well-maintained solution. This solution provides memory bandwidth observability for OPEA microservices and lays the groundwork for better scaling and auto-scaling of OPEA. It can also be deployed separately in end-user environments, supporting any use case where memory bandwidth metrics are required.

## Setup

### Enable NRI in Containerd

```sh
# download containerd binary, containerd version v1.7.0 or higher is required
$ wget https://github.com/containerd/containerd/releases/download/v1.7.0/containerd-1.7.0-linux-amd64.tar.gz

# stop running containerd
$ sudo systemctl stop containerd

# replace old containerd
$ sudo tar Cxzvf /usr/local containerd-1.7.0-linux-amd64.tar.gz

# enable NRI in containerd
# add an item in /etc/containerd/config.toml
[plugins."io.containerd.nri.v1.nri"]
disable = false
disable_connections = false
plugin_config_path = "/etc/containerd/certs.d"
plugin_path = "/opt/nri/plugins"
socket_path = "/var/run/nri/nri.sock"
config_file = "/etc/nri/nri.conf"

# restart containerd
$ sudo systemctl start containerd
$ sudo systemctl status containerd

# test nri
$ git clone https://github.com/containerd/nri
$ cd nri
$ make
$ ./build/bin/logger -idx 00
```

### Enable RDT

Mount resctrl to the directory `/sys/fs/resctrl`:

```sh
$ sudo mount -t resctrl resctrl /sys/fs/resctrl
```

### Setup memory bandwidth exporter

Before setup, you need to configure the runc hook:

```sh
$ ./config/config.sh
```

#### How to build the binary and setup?

```sh
$ make build
$ sudo ./bin/memory-bandwidth-exporter
# e.g., sudo ./bin/memory-bandwidth-exporter --collector.node.name=<node_name> --collector.container.namespaceWhiteList="calico-apiserver,calico-system,kube-system,tigera-operator"

# get memory bandwidth metrics
$ curl http://localhost:9100/metrics
```

#### How to build the docker image and setup?

```sh
$ make docker.build
$ sudo docker run \
-e NODE_NAME=<node_name> \
-e NAMESPACE_WHITELIST="calico-apiserver,calico-system,kube-system,tigera-operator" \
--mount type=bind,source=/etc/containers/oci/hooks.d/,target=/etc/containers/oci/hooks.d/ \
--privileged \
--cgroupns=host \
--pid=host \
--mount type=bind,source=/usr/,target=/usr/ \
--mount type=bind,source=/sys/fs/resctrl/,target=/sys/fs/resctrl/ \
--mount type=bind,source=/var/run/nri/,target=/var/run/nri/ \
-d -p 9100:9100 \
--name=memory-bandwidth-exporter \
opea/memory-bandwidth-exporter:latest

# get memory bandwidth metrics
$ curl http://localhost:9100/metrics
```

#### How to deploy on the K8s cluster?

Build and push your image to the location specified by `MBE_IMG`, then apply the manifest:

```sh
make docker.build docker.push MBE_IMG=<some-registry>/opea/memory-bandwidth-exporter:<tag>
make change_img MBE_IMG=<some-registry>/opea/memory-bandwidth-exporter:<tag>
kubectl create ns system
kubectl apply -f config/manifests/memory-bandwidth-exporter.yaml
```

Check the installation result:

```sh
kubectl get pods -n system
NAME READY STATUS RESTARTS AGE
memory-bandwidth-exporter-zxhdl 1/1 Running 0 3m
```

Get the memory bandwidth metrics:

```sh
$ curl http://<memory_bandwidth_exporter_container_ip>:9100/metrics
```

#### How to delete binary?

```sh
$ make clean
```

## More flags about memory bandwidth exporter

There are some flags to help users better use memory bandwidth exporter:

```sh
-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man).
--collector.node.name="" Give node name.
--collector.container.namespaceWhiteList="" Filter out containers whose namespaces belong to the namespace whitelist, namespaces separated by commas, like "xx,xx,xx".
--collector.container.monTimes=10 Scan the pids of containers created before the exporter starts to prevent the loss of pids.
--collector.container.metrics="all" Enable container collector metrics.
--collector.class.metrics="none" Enable class collector metrics.
--collector.node.metrics="none" Enable node collector metrics.
--web.telemetry-path="/metrics" Path under which to expose metrics.
--[no-]web.disable-exporter-metrics Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).
--web.max-requests=40 Maximum number of parallel scrape requests. Use 0 to disable.
--runtime.gomaxprocs=1 The target number of CPUs Go will run on (GOMAXPROCS) ($GOMAXPROCS)
--[no-]web.systemd-socket Use systemd socket activation listeners instead of port listeners (Linux only).
--web.listen-address=:9100 ... Addresses on which to expose metrics and web interface. Repeatable for multiple addresses.
--web.config.file="" Path to configuration file that can enable TLS or authentication. See: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
--collector.interval=3s memory bandwidth exporter collect metrics interval
--NRIplugin.name="mb-nri-plugin" Plugin name to register to NRI
--NRIplugin.idx="11" Plugin index to register to NRI
--[no-]disableWatch Disable watching hook directories for new hooks
--log.level=info Only log messages with the given severity or above. One of: [debug, info, warn, error]
--log.format=logfmt Output format of log messages. One of: [logfmt, json]
--[no-]version Show application version.
```
Loading

0 comments on commit 87ff280

Please sign in to comment.