-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This workload uses an EgressIP object with egress IP addresses. Client pods use these IP addresses to communicate with an external server. The client validates whether the packet data matches the egress IP. README.md is updated with more details about the workload. Signed-off-by: venkataanil <[email protected]>
- Loading branch information
1 parent
3ec370f
commit 5b4af01
Showing
12 changed files
with
570 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Client Deployment template for the egressip workload.
# Rendered once per replica/iteration; every pod carries the `app: client`
# label and exposes a port named `metrics` on 8080.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: client-{{.Replica}}-{{.Iteration}}
spec:
  replicas: {{.podReplicas}}
  selector:
    matchLabels:
      name: client-{{.Replica}}-{{.Iteration}}
  template:
    metadata:
      labels:
        name: client-{{.Replica}}-{{.Iteration}}
        app: client
    spec:
      # Best-effort even spread of client pods across nodes.
      topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: ScheduleAnyway
          labelSelector:
            matchLabels:
              app: client
      # Schedule only on worker nodes that are neither infra nor workload nodes.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
                  - key: node-role.kubernetes.io/infra
                    operator: DoesNotExist
                  - key: node-role.kubernetes.io/workload
                    operator: DoesNotExist
      containers:
        - name: client-app
          image: quay.io/cloud-bulldozer/eipvalidator:latest
          resources:
            requests:
              memory: "10Mi"
              cpu: "10m"
          ports:
            - containerPort: 8080
              name: metrics
          env:
            - name: EXT_SERVER_HOST
              value: "{{.extServerHost}}"
            # Port is 9002 plus (iteration mod 60), i.e. 9002..9061.
            - name: EXT_SERVER_PORT
              value: "{{ add 9002 (mod .Iteration 60) }}"
            # Space-separated output of GetIPAddress re-joined with commas.
            - name: EGRESS_IPS
              {{- $eips := (splitList " " (GetIPAddress .eipAddresses .Iteration .addrPerIteration) | join ",") }}
              value: "{{$eips}}"
            - name: DELAY_BETWEEN_REQ_SEC
              value: "1"
            - name: REQ_TIMEOUT_SEC
              value: "3"
          imagePullPolicy: Always
          securityContext:
            privileged: false
      restartPolicy: Always
  strategy:
    type: RollingUpdate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
---
# EgressIP object for one iteration: applies to pods labelled `app: client`
# in the namespace named egressip-<iteration>.
apiVersion: k8s.ovn.org/v1
kind: EgressIP
metadata:
  name: egressip-obj-{{.Iteration}}
spec:
  # One list entry per address in the space-separated GetIPAddress output.
  egressIPs:
  {{range (splitList " " (GetIPAddress .eipAddresses .Iteration .addrPerIteration))}}
    - {{.}}
  {{end}}
  namespaceSelector:
    matchLabels:
      kubernetes.io/metadata.name: egressip-{{.Iteration}}
  podSelector:
    matchLabels:
      app: client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
---
# kube-burner job configuration for the egressip workload.
# NOTE(review): nesting was reconstructed from flattened text; in particular
# `metricsEndpoints` is assumed to be a top-level key — confirm against the
# kube-burner configuration schema in use.
global:
  gc: {{.GC}}
  gcMetrics: {{.GC_METRICS}}
  measurements:
    - name: podLatency
      thresholds:
        - conditionType: Ready
          metric: P99
          threshold: 15s
metricsEndpoints:
{{ if .ES_SERVER }}
  - indexer:
      esServers: ["{{.ES_SERVER}}"]
      insecureSkipVerify: true
      defaultIndex: {{.ES_INDEX}}
      type: opensearch
{{ end }}
{{ if eq .LOCAL_INDEXING "true" }}
  - indexer:
      type: local
      metricsDirectory: collected-metrics-{{.UUID}}
{{ end }}

jobs:
  - name: egressip
    namespace: egressip
    jobIterations: {{.JOB_ITERATIONS}}
    qps: {{.QPS}}
    burst: {{.BURST}}
    namespacedIterations: true
    podWait: false
    waitWhenFinished: true
    preLoadImages: false
    preLoadPeriod: 15s
    # Label values are strings; quoted so they are not parsed as booleans.
    namespaceLabels:
      security.openshift.io/scc.podSecurityLabelSync: "false"
      pod-security.kubernetes.io/enforce: privileged
      pod-security.kubernetes.io/audit: privileged
      pod-security.kubernetes.io/warn: privileged
      openshift.io/cluster-monitoring: "true"
    objects:

      - objectTemplate: prometheus_role.yml
        replicas: 1

      - objectTemplate: prometheus_role_binding.yml
        replicas: 1

      - objectTemplate: pod_monitor.yml
        replicas: 1

      - objectTemplate: egressip-obj.yml
        replicas: 1
        inputVars:
          eipAddresses: "{{.EIP_ADDRESSES}}"
          addrPerIteration: {{.ADDRESSES_PER_ITERATION}}

      - objectTemplate: deployment-client.yml
        replicas: 1
        inputVars:
          podReplicas: 2
          eipAddresses: "{{.EIP_ADDRESSES}}"
          addrPerIteration: {{.ADDRESSES_PER_ITERATION}}
          extServerHost: "{{.EXTERNAL_SERVER_IP}}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# PodMonitor scraping the port named `metrics` of pods labelled `app: client`
# over HTTP every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: pod-monitor-{{.Replica}}
spec:
  selector:
    matchLabels:
      app: client
  podMetricsEndpoints:
    - port: metrics
      interval: 15s
      scheme: http
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Namespaced Role granting read-only (get/list/watch) access to pods.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: prometheus-k8s
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Binds the prometheus-k8s Role to the prometheus-k8s ServiceAccount that
# lives in the openshift-monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: prometheus-k8s
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: openshift-monitoring
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
# EgressIP App metrics
# Collected as instant queries; the last two keep only samples greater than 0.
- query: scale_eip_startup_latency_total
  metricName: eipStartupLatencyTotal
  instant: true

- query: scale_eip_recovery_latency>0
  metricName: eipRecoveryLatencyTotal
  instant: true

- query: scale_startup_non_eip_total{}>0
  metricName: startupEipNodeIPReqCount
  instant: true

# API server
# Pod binding throughput, P99 read/write request latency and request rate.
- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
  metricName: schedulingThroughput

- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0
  metricName: readOnlyAPICallsLatency

- query: histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope)) > 0
  metricName: mutatingAPICallsLatency

- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH"}[2m])) by (verb,resource,code) > 0
  metricName: APIRequestRate

# Kubeproxy and OVN service sync latency
# NOTE(review): both queries are stored under the same metricName
# (serviceSyncLatency) — presumably only one of them returns data on a given
# cluster; confirm this is intended.

- query: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 0
  metricName: serviceSyncLatency

- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{kind="service"}[2m])) by (le))
  metricName: serviceSyncLatency

# Containers & pod metrics
# CPU (percent, via irate * 100) and RSS memory for selected namespaces,
# reported per master/infra node and averaged across worker nodes.

- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|network-node-identity|multus|.*apiserver|authentication|.*controller-manager|.*scheduler|image-registry|operator-lifecycle-manager)|cilium|stackrox|calico.*|tigera.*"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
  metricName: containerCPU-Masters

- query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus|ingress)|cilium|stackrox|calico.*|tigera.*"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0
  metricName: containerCPU-AggregatedWorkers

- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(monitoring|sdn|ovn-kubernetes|multus|ingress)|stackrox"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
  metricName: containerCPU-Infra

- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|network-node-identity|sdn|multus|ingress|authentication|.*controller-manager|.*scheduler|image-registry|operator-lifecycle-manager)|cilium|stackrox|calico.*|tigera.*"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
  metricName: containerMemory-Masters

- query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus|ingress)|cilium|stackrox|calico.*|tigera.*"} and on (node) kube_node_role{role="worker"}) by (container, namespace)
  metricName: containerMemory-AggregatedWorkers

- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|multus|ingress|monitoring|image-registry)|cilium|stackrox|calico.*|tigera.*"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
  metricName: containerMemory-Infra

# Node metrics: CPU & Memory
# label_replace copies kube_node_role's `node` label into `instance` so it can
# be matched against node_cpu_seconds_total.

- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
  metricName: nodeCPU-Masters

- query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
  metricName: nodeCPU-AggregatedWorkers

- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) > 0
  metricName: nodeCPU-Infra

# We compute memory utilization by subtracting available memory from the total
# (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes).
- query: avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
  metricName: nodeMemoryUtilization-AggregatedWorkers

- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
  metricName: nodeMemoryUtilization-Masters

- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")
  metricName: nodeMemoryUtilization-Infra

# Kubelet & CRI-O runtime metrics
# Each query is restricted to the top 3 worker nodes selected by the topk()
# sub-expression, evaluated over the {{ .elapsed }} window.
# NOTE(review): the memory selectors apply irate() to
# process_resident_memory_bytes, which looks like a gauge — confirm intended.

- query: irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) * 100 and on (node) topk(3,avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[{{ .elapsed }}:]) and on (node) kube_node_role{role="worker"})
  metricName: kubeletCPU

- query: process_resident_memory_bytes{service="kubelet",job="kubelet"} and on (node) topk(3,max_over_time(irate(process_resident_memory_bytes{service="kubelet",job="kubelet"}[2m])[{{ .elapsed }}:]) and on (node) kube_node_role{role="worker"})
  metricName: kubeletMemory

- query: irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) * 100 and on (node) topk(3,avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[{{ .elapsed }}:]) and on (node) kube_node_role{role="worker"})
  metricName: crioCPU

- query: process_resident_memory_bytes{service="kubelet",job="crio"} and on (node) topk(3,max_over_time(irate(process_resident_memory_bytes{service="kubelet",job="crio"}[2m])[{{ .elapsed }}:]) and on (node) kube_node_role{role="worker"})
  metricName: crioMemory

# Etcd metrics
# Leader-change rate, P99 disk/network latencies and the cluster version.

- query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
  metricName: etcdLeaderChangesRate

- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
  metricName: 99thEtcdDiskBackendCommitDurationSeconds

- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
  metricName: 99thEtcdDiskWalFsyncDurationSeconds

- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
  metricName: 99thEtcdRoundTripTimeSeconds

- query: sum by (cluster_version)(etcd_cluster_version)
  metricName: etcdVersion
  instant: true

# Cluster metrics
# Object counts and node/pod status; entries marked `instant: true` are
# collected as instant queries.

- query: sum(kube_namespace_status_phase) by (phase) > 0
  metricName: namespaceCount

- query: sum(kube_pod_status_phase{}) by (phase)
  metricName: podStatusCount

- query: count(kube_secret_info{})
  metricName: secretCount
  instant: true

- query: count(kube_deployment_labels{})
  metricName: deploymentCount
  instant: true

- query: count(kube_configmap_info{})
  metricName: configmapCount
  instant: true

- query: count(kube_service_info{})
  metricName: serviceCount
  instant: true

- query: count(openshift_route_created{})
  metricName: routeCount
  instant: true

- query: kube_node_role
  metricName: nodeRoles

- query: sum(kube_node_status_condition{status="true"}) by (condition)
  metricName: nodeStatus

- query: count(kube_replicaset_labels{})
  metricName: replicaSetCount
  instant: true

- query: count(kube_pod_info{} AND ON (pod) kube_pod_status_phase{phase="Running"}==1) by (node)
  metricName: podDistribution

# Prometheus metrics
# Head series and appended samples for the prometheus-k8s job.

- query: openshift:prometheus_tsdb_head_series:sum{job="prometheus-k8s"}
  metricName: prometheus-timeseriestotal

- query: openshift:prometheus_tsdb_head_samples_appended_total:sum{job="prometheus-k8s"}
  metricName: prometheus-ingestionrate

# Retain the raw CPU seconds totals for comparison
# All entries are instant queries over cumulative counters, grouped by CPU
# mode (node level), cgroup id, or namespace.
- query: sum( node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="worker",role!="infra"}, "instance", "$1", "node", "(.+)") ) by (mode)
  metricName: nodeCPUSeconds-Workers
  instant: true

- query: sum( node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)") ) by (mode)
  metricName: nodeCPUSeconds-Masters
  instant: true

- query: sum( node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)") ) by (mode)
  metricName: nodeCPUSeconds-Infra
  instant: true

- query: sum ( container_cpu_usage_seconds_total { id =~ "/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/kubepods.slice" } and on (node) kube_node_role{ role = "worker",role != "infra" } ) by ( id )
  metricName: cgroupCPUSeconds-Workers
  instant: true

- query: sum ( container_cpu_usage_seconds_total { id =~ "/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/kubepods.slice" } and on (node) kube_node_role{ role = "master" } ) by ( id )
  metricName: cgroupCPUSeconds-Masters
  instant: true

- query: sum ( container_cpu_usage_seconds_total { id =~ "/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/kubepods.slice" } and on (node) kube_node_role{ role = "infra" } ) by ( id )
  metricName: cgroupCPUSeconds-Infra
  instant: true

- query: sum( container_cpu_usage_seconds_total{container!~"POD|",namespace=~"openshift-.*"} ) by (namespace)
  metricName: cgroupCPUSeconds-namespaces
  instant: true
Oops, something went wrong.