kubernetes integration #3

Open · wants to merge 1 commit into base: master
11 changes: 11 additions & 0 deletions example/kubernetes/Dockerfile
@@ -0,0 +1,11 @@
FROM almalinux:8

ADD rocm.repo /etc/yum.repos.d/
ADD protobuf.repo /etc/yum.repos.d/
RUN yum install net-tools procps-ng gcc-c++ rdc protobuf python3-devel -y
ENV LD_LIBRARY_PATH /opt/rocm/rdc/lib:/opt/rocm/rdc/grpc/lib/
ENV PATH /root/.local/bin:$PATH
RUN pip3 install --user -U pip
RUN pip3 install --user protobuf==3.11.3
RUN pip3 install --user prometheus_client kuryr_kubernetes
RUN ln -s /lib64/libprotobuf.so.22 /lib64/libprotobuf.so.3.11.2.0
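The image above has to be pushed to a registry the cluster can pull from; the rdc.yaml manifest below references it as your_container_repository/rdc:5.1, which is a placeholder. A minimal build-and-push sketch, assuming the Dockerfile sits next to rocm.repo and protobuf.repo in example/kubernetes/ and that the placeholder tag is replaced with your own registry:

# Build the RDC image next to rocm.repo and protobuf.repo, then push it (placeholder tag)
docker build -t your_container_repository/rdc:5.1 example/kubernetes/
docker push your_container_repository/rdc:5.1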
7 changes: 7 additions & 0 deletions example/kubernetes/protobuf.repo
@@ -0,0 +1,7 @@
[ussuri]
name=protobuf
baseurl=https://buildlogs.centos.org/centos/8/cloud/x86_64/openstack-ussuri
enabled=1
fastestmirror_enabled=0
gpgcheck=0
priority=1
161 changes: 161 additions & 0 deletions example/kubernetes/rdc.yaml
@@ -0,0 +1,161 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: rdc
  labels:
    app: rdc
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: rdc
  template:
    metadata:
      labels:
        app: rdc
    spec:
      priorityClassName: system-node-critical
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - effect: NoSchedule
          key: amd.com/gpu
          operator: Exists
      nodeSelector:
        gpu: "amd"
      containers:
        - name: rdc
          image: your_container_repository/rdc:5.1
          imagePullPolicy: Always
          command: ["/opt/rocm/rdc/bin/rdcd"]
          args: ["-u"]
          securityContext:
            privileged: true
            capabilities:
              drop:
                - all
          lifecycle:
            postStart:
              exec:
                command:
                  - /bin/bash
                  - -c
                  - while [ $(netstat -lntp |grep rdcd| wc -l) -ne 1 ]; do sleep 1; done
            preStop:
              exec:
                command:
                  - /bin/bash
                  - -c
                  - while [ $(netstat -lntp |grep 5000| wc -l) -ne 0 ]; do sleep 1; done
          readinessProbe:
            tcpSocket:
              port: 50051
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: 50051
            initialDelaySeconds: 15
            periodSeconds: 20
          resources:
            limits:
              cpu: 100m
              memory: 30Mi
            requests:
              cpu: 20m
              memory: 30Mi
          ports:
            - containerPort: 50051
              name: rdc
              protocol: TCP
          volumeMounts:
            - name: sys
              mountPath: /sys
        - name: prometheus-rdc
          command: ["python3"]
          args: ["/opt/rocm/rdc/python_binding/rdc_prometheus.py", "--rdc_unauth", "--enable_kubernetes_integration"]
          image: your_container_repository/rdc:5.1
          imagePullPolicy: Always
          securityContext:
            privileged: true
            capabilities:
              drop:
                - all
          readinessProbe:
            httpGet:
              path: /metrics
              port: metrics
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /metrics
              port: metrics
              scheme: HTTP
            initialDelaySeconds: 15
            periodSeconds: 20
          resources:
            limits:
              cpu: 100m
              memory: 100Mi
            requests:
              cpu: 20m
              memory: 100Mi
          ports:
            - containerPort: 5000
              name: metrics
              protocol: TCP
          volumeMounts:
            - name: sys
              mountPath: /sys
            - name: podresources-api
              mountPath: /var/lib/kubelet/pod-resources
      terminationGracePeriodSeconds: 30
      volumes:
        - name: podresources-api
          hostPath:
            path: /var/lib/kubelet/pod-resources
        - name: sys
          hostPath:
            path: /sys
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: rdc
  name: rdc
  namespace: kube-system
spec:
  ports:
    - port: 5000
      protocol: TCP
      targetPort: 5000
      name: metrics
  selector:
    app: rdc
  type: ClusterIP
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: rdc
  namespace: kube-system
spec:
  endpoints:
    - interval: 30s
      path: /metrics
      port: metrics
      relabelings:
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          targetLabel: node
  namespaceSelector:
    matchNames:
      - kube-system
  selector:
    matchLabels:
      app: rdc
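A possible way to roll out the manifest above, assuming kubectl access to the cluster and that the GPU nodes carry the gpu=amd label expected by the DaemonSet nodeSelector; the ServiceMonitor additionally assumes the Prometheus Operator CRDs are installed (the node name below is an example):

# Label a GPU node so the DaemonSet schedules onto it
kubectl label node gpu-node-1 gpu=amd

# Create the DaemonSet, Service and ServiceMonitor in kube-system
kubectl apply -f example/kubernetes/rdc.yaml

# Verify that both containers (rdcd and the Prometheus plugin) are running on each GPU node
kubectl -n kube-system get daemonset rdc
kubectl -n kube-system get pods -l app=rdc -o wide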



7 changes: 7 additions & 0 deletions example/kubernetes/rocm.repo
@@ -0,0 +1,7 @@
[rocm]
name=rocm
baseurl=https://repo.radeon.com/rocm/centos8/latest/main
enabled=1
fastestmirror_enabled=0
gpgcheck=0
priority=1
77 changes: 69 additions & 8 deletions python_binding/rdc_prometheus.py
@@ -15,7 +15,7 @@

 class PrometheusReader(RdcReader):
     def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples,
-                 gpu_indexes, rdc_unauth, enable_plugin_monitoring):
+                 gpu_indexes, rdc_unauth, enable_plugin_monitoring, enable_kubernetes_integration):
         group_name = "rdc_prometheus_plugin_group"
         field_group_name = "rdc_prometheus_plugin_fieldgroup"
         if rdc_unauth:
@@ -32,15 +32,75 @@ def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_s
         REGISTRY.unregister(PROCESS_COLLECTOR)
         REGISTRY.unregister(PLATFORM_COLLECTOR)

-        # Create the guages
-        self.guages = {}
+        self.enable_kubernetes_integration = enable_kubernetes_integration
+
+        # Create the gauges
+        self.gauges = {}
         for fid in self.field_ids:
             field_name = self.rdc_util.field_id_string(fid).lower()
-            self.guages[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
+            if enable_kubernetes_integration:
+                self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index', 'pod', 'namespace', 'container'])
+            else:
+                self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
+
+        if enable_kubernetes_integration:
+            import sys, os
+            sys.path.append('/opt/rocm/bin')
+            from rocm_smi import getBus, initializeRsmi
+            from kuryr_kubernetes.pod_resources.client import PodResourcesClient
+
+            # Create a kubelet client for the PodResources API to get the PCIe bus address of each attached GPU
+            self.pr_client = PodResourcesClient(os.getenv('RDC_KUBERNETES_KUBELET_PATH','/var/lib/kubelet'))
+
+            self.empty_label_value = os.getenv('RDC_KUBERNETES_EMPTY_LABEL_VALUE','')
+
+            initializeRsmi()
+
+            # Cache the mapping between GPU indexes and PCIe bus addresses; assumes no hotplug of GPUs
+            self.index_to_bus_addr = {}
+            for item in self.gpu_indexes:
+                self.index_to_bus_addr[item] = getBus(item)
+
+    def process(self):
+        # Make sure no other thread collects metrics before we are fully finished with them
+        with REGISTRY._lock:
+            if self.enable_kubernetes_integration:
+                from google.protobuf.json_format import MessageToDict
+                # Get the list of all pods and their containers with devices attached to them
+                self.pod_list = MessageToDict(self.pr_client.list())
+                # Clear the labels; they are populated again below
+                for fid in self.field_ids:
+                    self.gauges[fid].clear()
+            RdcReader.process(self)

     def handle_field(self, gpu_index, value):
-        if value.field_id.value in self.guages:
-            self.guages[value.field_id.value].labels(gpu_index).set(value.value.l_int)
+        fid = value.field_id.value
+        if fid in self.gauges:
+            if self.enable_kubernetes_integration:
+                gpu_bus_addr = self.index_to_bus_addr[gpu_index]
+                # Check whether this GPU is attached to any container; a single GPU can only be attached to one container
+                container_data = self.findContainer(gpu_bus_addr)
+                if container_data:
+                    self.gauges[fid].labels(gpu_index=gpu_index, pod=container_data['pod'], namespace=container_data['namespace'], container=container_data['container']).set(value.value.l_int)
+                else:
+                    self.gauges[fid].labels(gpu_index=gpu_index, pod=self.empty_label_value, namespace=self.empty_label_value, container=self.empty_label_value).set(value.value.l_int)
+            else:
+                self.gauges[fid].labels(gpu_index).set(value.value.l_int)
+
+    def findContainer(self, dev_id):
+        container_dict = {}
+        for pod in self.pod_list['podResources']:
+            for container in pod['containers']:
+                if 'devices' in container:
+                    for device in container['devices']:
+                        if device['resourceName'] == 'amd.com/gpu':
+                            if device['deviceIds'][0] == dev_id:
+                                container_dict['container'] = container['name']
+                                container_dict['pod'] = pod['name']
+                                container_dict['namespace'] = pod['namespace']
+                                return container_dict
+        return container_dict


 def get_field_ids(args):
     field_ids = []
@@ -81,7 +141,8 @@ def get_field_ids(args):
     parser.add_argument('--rdc_fields_file', default=None, help='The list of fields name can also be read from a file with each field name in a separated line (default: None)')
     parser.add_argument('--rdc_gpu_indexes', default=None, nargs='+', help='The list of GPUs to be watched (default: All GPUs)')
     parser.add_argument('--enable_plugin_monitoring', default=False, action='store_true', help = 'Set this option to collect process metrics of the plugin itself (default: false)')
-
+    parser.add_argument('--enable_kubernetes_integration', default=False, action='store_true', help='Set this option to enable per-pod GPU monitoring in Kubernetes (default: false)')
+
     args = parser.parse_args()

     field_ids = get_field_ids(args)
@@ -94,7 +155,7 @@ def get_field_ids(args):

     reader = PrometheusReader(rdc_ip_port, field_ids, args.rdc_update_freq*1000000,
                               args.rdc_max_keep_age, args.rdc_max_keep_samples,
-                              args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring)
+                              args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring, args.enable_kubernetes_integration)
     start_http_server(args.listen_port)
     print("The RDC Prometheus plugin listen at port %d" % (args.listen_port))
     time.sleep(3)
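To sanity-check the new labels end to end, the rdc Service can be port-forwarded and scraped directly. The output line below is illustrative only: the metric name depends on which RDC fields are being watched, and the pod, namespace and container labels are only filled in while a pod has the GPU allocated through the amd.com/gpu device plugin resource:

# Port-forward the metrics Service from kube-system and scrape it once
kubectl -n kube-system port-forward svc/rdc 5000:5000 &
curl -s http://localhost:5000/metrics | grep rdc_fi_gpu_util
# Hypothetical output with the Kubernetes labels added by this change:
# rdc_fi_gpu_util{gpu_index="0",pod="train-0",namespace="default",container="trainer"} 87.0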